diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6eb9fed679..de3e52f5055 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
 set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")
 
 option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
+option(NVFUSER_EXPLICIT_ERROR_CHECK "" OFF)
+if (NVFUSER_EXPLICIT_ERROR_CHECK)
+  add_compile_definitions(NVFUSER_EXPLICIT_ERROR_CHECK)
+endif()
 option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)
 
 include(CMakeDependentOption)
@@ -545,6 +549,7 @@ list(APPEND JIT_TEST_SRCS
   ${NVFUSER_ROOT}/tests/cpp/test_id_model.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_indexing_advanced.cpp
+  ${NVFUSER_ROOT}/tests/cpp/test_inlining.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_loop_domain_scheduling.cpp
diff --git a/benchmarks/cpp/batch_norm_channels_first.cpp b/benchmarks/cpp/batch_norm_channels_first.cpp
index 1bc1845d912..22098787766 100644
--- a/benchmarks/cpp/batch_norm_channels_first.cpp
+++ b/benchmarks/cpp/batch_norm_channels_first.cpp
@@ -78,7 +78,7 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) {
 
 static void NvFuserScheduler_BatchNorm(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);
@@ -102,7 +102,7 @@ static void NvFuserScheduler_BatchNorm(
   std::vector<c10::IValue> aten_inputs(
       {at_x, at_weight, at_bias, at_run_mean, at_run_var});
 
-  runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+  runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
 
   benchmark_state.SetBytesProcessed(
       int64_t(benchmark_state.iterations()) *
diff --git a/benchmarks/cpp/batch_norm_channels_first_backward.cpp b/benchmarks/cpp/batch_norm_channels_first_backward.cpp
index 271d04eece9..0edd2d3e52d 100644
--- a/benchmarks/cpp/batch_norm_channels_first_backward.cpp
+++ b/benchmarks/cpp/batch_norm_channels_first_backward.cpp
@@ -89,7 +89,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {
 
 static void NvFuserScheduler_BatchNorm_BWD(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);
@@ -115,7 +115,7 @@ static void NvFuserScheduler_BatchNorm_BWD(
   std::vector<c10::IValue> aten_inputs(
       {input, grad_out, weight, run_mean, run_var, save_mean, save_var});
 
-  runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+  runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
 
   benchmark_state.SetBytesProcessed(
       int64_t(benchmark_state.iterations()) *
diff --git a/benchmarks/cpp/batch_norm_channels_last.cpp b/benchmarks/cpp/batch_norm_channels_last.cpp
index dc21fede5f6..bbdd5c82e63 100644
--- a/benchmarks/cpp/batch_norm_channels_last.cpp
+++ b/benchmarks/cpp/batch_norm_channels_last.cpp
@@ -79,7 +79,7 @@ static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) {
 
 static void NvFuserScheduler_BatchNorm_nhwc(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);
@@ -103,7 +103,7 @@ static void NvFuserScheduler_BatchNorm_nhwc(
   std::vector<c10::IValue> aten_inputs(
       {at_x, at_weight, at_bias, at_run_mean, at_run_var});
 
-  runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+  runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
 
   benchmark_state.SetBytesProcessed(
       int64_t(benchmark_state.iterations()) *
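The NVFUSER_EXPLICIT_ERROR_CHECK option added to CMakeLists.txt above follows the standard CMake pattern: option() declares an OFF-by-default cache variable, and add_compile_definitions() turns it into a preprocessor define visible to every translation unit. A minimal sketch of how such a define is typically consumed on the C++ side; the macro name comes from the diff, but the guarded body is only an illustrative assumption, not code from this PR:

    // Enable with: cmake -DNVFUSER_EXPLICIT_ERROR_CHECK=ON
    #ifdef NVFUSER_EXPLICIT_ERROR_CHECK
    // hypothetical: eagerly surface async CUDA errors at this point
    NVFUSER_CUDA_RT_SAFE_CALL(cudaDeviceSynchronize());
    #endif
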
diff --git a/benchmarks/cpp/batch_norm_channels_last_backward.cpp b/benchmarks/cpp/batch_norm_channels_last_backward.cpp
index a11627139d4..e40508fffe3 100644
--- a/benchmarks/cpp/batch_norm_channels_last_backward.cpp
+++ b/benchmarks/cpp/batch_norm_channels_last_backward.cpp
@@ -90,7 +90,7 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {
 
 static void NvFuserScheduler_BatchNorm_nhwc_BWD(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half);
@@ -116,7 +116,7 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
   std::vector<c10::IValue> aten_inputs(
       {input, grad_out, weight, run_mean, run_var, save_mean, save_var});
 
-  runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
+  runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
 
   benchmark_state.SetBytesProcessed(
       int64_t(benchmark_state.iterations()) *
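The benchmark entry points above, and throughout the rest of this diff, mechanically rename the FusionExecutorCache* parameter from fusion_executor_cache to executor_cache; later files also rename the class FusionExecutor to KernelExecutor, with compileFusion() becoming compile() and runFusion() becoming run(). A minimal sketch of the two call patterns after the rename, assuming a scheduled Fusion and a std::vector<c10::IValue> of ATen inputs (the function name NvFuserScheduler_Example is illustrative):

    // cached path: segmentation, scheduling, and compilation are handled
    // by FusionExecutorCache inside runBenchmarkIterations
    static void NvFuserScheduler_Example(
        benchmark::State& benchmark_state,
        FusionExecutorCache* executor_cache,
        DataType dtype) {
      std::vector<c10::IValue> aten_inputs = {/* ATen tensors */};
      runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
    }

    // manual path: compile one kernel up front, then run it directly
    KernelExecutor ke;
    ke.compile(&fusion, aten_inputs);
    auto outputs = ke.run(aten_inputs);
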
diff --git a/benchmarks/cpp/bert.cpp b/benchmarks/cpp/bert.cpp
index d22e234cde1..94edab48479 100644
--- a/benchmarks/cpp/bert.cpp
+++ b/benchmarks/cpp/bert.cpp
@@ -118,7 +118,7 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) {
 
 static void NvFuserScheduler_DivMaxSoftDropFwd(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   auto w = benchmark_state.range(0);
   auto x = benchmark_state.range(1);
@@ -135,7 +135,7 @@ static void NvFuserScheduler_DivMaxSoftDropFwd(
   std::vector<c10::IValue> at_inputs = {t0, t1};
 
   auto bytes =
-      runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));
@@ -143,7 +143,7 @@ static void NvFuserScheduler_DivMaxSoftDropFwd(
 
 static void NvFuserScheduler_DivMaxSoftDropBwd(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   auto w = benchmark_state.range(0);
   auto x = benchmark_state.range(1);
@@ -162,7 +162,7 @@ static void NvFuserScheduler_DivMaxSoftDropBwd(
   std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
 
   auto bytes =
-      runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
 
   // Some reason t1 isn't used, ignore it.
   bytes -=
@@ -228,7 +228,7 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) {
 
 static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   auto x = benchmark_state.range(0);
   auto y = benchmark_state.range(1);
@@ -247,7 +247,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
   std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3, t4};
 
   auto bytes =
-      runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));
@@ -304,7 +304,7 @@ static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) {
 
 static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   auto x = benchmark_state.range(0);
   auto y = benchmark_state.range(1);
@@ -322,7 +322,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
   std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
 
   auto bytes =
-      runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));
@@ -380,7 +380,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {
 
 static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   auto x = benchmark_state.range(0);
   auto y = benchmark_state.range(1);
@@ -398,7 +398,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
   std::vector<c10::IValue> at_inputs = {t4, t5, t1, t8};
 
   auto bytes =
-      runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));
@@ -438,7 +438,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) {
 
 static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype) {
   auto x = benchmark_state.range(0);
   auto y = benchmark_state.range(1);
@@ -454,7 +454,7 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
   std::vector<c10::IValue> at_inputs = {t0, t21};
 
   auto bytes =
-      runBenchmarkIterations(benchmark_state, fusion_executor_cache, at_inputs);
+      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));
diff --git a/benchmarks/cpp/broadcast.cpp b/benchmarks/cpp/broadcast.cpp
index c3accd47d2e..6ef7564a6e0 100644
--- a/benchmarks/cpp/broadcast.cpp
+++ b/benchmarks/cpp/broadcast.cpp
@@ -56,7 +56,7 @@ static void setupBroadcast(Fusion* fusion, DataType dtype, int bcast_axis) {
 
 static void NvFuserScheduler_Broadcast(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     DataType dtype,
     int bcast_dim) {
   auto bcast_size = benchmark_state.range(0);
@@ -74,7 +74,7 @@ static void NvFuserScheduler_Broadcast(
   std::vector<c10::IValue> aten_inputs({t0, t1});
 
-
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/gelu_backward.cpp b/benchmarks/cpp/gelu_backward.cpp index 512ea915ae2..ae1e0ce2473 100644 --- a/benchmarks/cpp/gelu_backward.cpp +++ b/benchmarks/cpp/gelu_backward.cpp @@ -162,8 +162,8 @@ static void NvFuserScheduler_GeluBackward_Compile( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); for (auto _ : benchmark_state) { - FusionExecutor executor; - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, inputs, heuristic_params->lparams); } } @@ -187,14 +187,14 @@ static void NvFuserScheduler_GeluBackward_RunFusion( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, inputs, heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); for (auto _ : benchmark_state) { - outputs = executor.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); clearL2Cache(); } @@ -218,11 +218,11 @@ static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, inputs, heuristic_params->lparams); runBenchmarkIterations( - benchmark_state, &executor, inputs, heuristic_params->lparams); + benchmark_state, &ke, inputs, heuristic_params->lparams); } BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_GpuOnly) @@ -247,13 +247,13 @@ static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.setExecuteKernelFlag(false); - executor.compileFusion(&fusion, inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.setExecuteKernelFlag(false); + ke.compile(&fusion, inputs, heuristic_params->lparams); for (auto _ : benchmark_state) { - outputs = executor.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); } } diff --git a/benchmarks/cpp/gelu_backward_reduction.cpp b/benchmarks/cpp/gelu_backward_reduction.cpp index 60ea8ed2b29..c4e97fc6d3b 100644 --- a/benchmarks/cpp/gelu_backward_reduction.cpp +++ b/benchmarks/cpp/gelu_backward_reduction.cpp @@ -93,7 +93,7 @@ static void setupGeluBackwardReduction( static void NvFuserScheduler_GeluBackwardReduction( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int reduction_dim) { auto reduction_size = benchmark_state.range(0); @@ -112,7 +112,7 @@ static void NvFuserScheduler_GeluBackwardReduction( std::vector aten_inputs = {aten_input_grad, aten_input_x}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // inputs: gradient tensor + input tensor // outputs: output, 
output_of_reduction diff --git a/benchmarks/cpp/heuristic_cache.cpp b/benchmarks/cpp/heuristic_cache.cpp index 29d5b13bef2..5ab923bd6d0 100644 --- a/benchmarks/cpp/heuristic_cache.cpp +++ b/benchmarks/cpp/heuristic_cache.cpp @@ -26,7 +26,7 @@ using namespace nvfuser; static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -84,12 +84,12 @@ static auto getLayerBackwardNormRuntime( auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormBackward_HeuristicCache( @@ -98,14 +98,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerBackwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); @@ -120,7 +120,7 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache( static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -141,11 +141,11 @@ static auto getLayerForwardNormRuntime( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormForward_HeuristicCache( @@ -154,14 +154,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicCache( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); diff --git a/benchmarks/cpp/heuristic_lookup.cpp b/benchmarks/cpp/heuristic_lookup.cpp index aecc7dc824f..16be106b728 100644 --- a/benchmarks/cpp/heuristic_lookup.cpp +++ b/benchmarks/cpp/heuristic_lookup.cpp @@ -26,7 +26,7 @@ using namespace nvfuser; static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& 
norm_shape) { @@ -86,12 +86,12 @@ static auto getLayerBackwardNormRuntime( auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( @@ -100,14 +100,14 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerBackwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); @@ -122,7 +122,7 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -143,11 +143,11 @@ static auto getLayerForwardNormRuntime( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } static void NvFuserScheduler_LayerNormForward_HeuristicLookup( @@ -156,14 +156,14 @@ static void NvFuserScheduler_LayerNormForward_HeuristicLookup( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); diff --git a/benchmarks/cpp/indexselect.cpp b/benchmarks/cpp/indexselect.cpp index ba5c7054cab..01eefc2d0a1 100644 --- a/benchmarks/cpp/indexselect.cpp +++ b/benchmarks/cpp/indexselect.cpp @@ -132,8 +132,8 @@ static void NvFuserScheduler_IndexSelect_Compile( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); for (auto _ : benchmark_state) { - FusionExecutor executor; - executor.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, c10::ArrayRef(inputs), heuristic_params->lparams); } } @@ -155,8 +155,8 @@ static void NvFuserScheduler_IndexSelect_RunFusion( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); @@ -164,7 
+164,7 @@ static void NvFuserScheduler_IndexSelect_RunFusion( at::Tensor output = at::empty_like(inputs[0].toTensor()); for (auto _ : benchmark_state) { - executor.runFusion( + ke.run( c10::ArrayRef(inputs), {output}, heuristic_params->lparams); @@ -235,7 +235,7 @@ static void setupIndexSelect(Fusion* fusion, DataType dtype, int select_dim) { static void NvFuserScheduler_IndexSelectSimple( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int select_dim) { auto elem_size = benchmark_state.range(0); @@ -257,7 +257,7 @@ static void NvFuserScheduler_IndexSelectSimple( std::vector aten_inputs = {t0, t1}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -267,7 +267,7 @@ static void NvFuserScheduler_IndexSelectSimple( static void NvFuserScheduler_IndexSelect( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int select_dim) { auto elem_size = benchmark_state.range(0); @@ -289,7 +289,7 @@ static void NvFuserScheduler_IndexSelect( std::vector aten_inputs = {t2, t0, t1}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/instance_norm.cpp b/benchmarks/cpp/instance_norm.cpp index d4c6707e912..2f3a832d8db 100644 --- a/benchmarks/cpp/instance_norm.cpp +++ b/benchmarks/cpp/instance_norm.cpp @@ -81,7 +81,7 @@ static void setupInstanceNorm( static void NvFuserScheduler_InstanceNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, bool channels_last_3d = false) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -116,7 +116,7 @@ static void NvFuserScheduler_InstanceNorm( at_x, at_weight, at_bias, at_mean, at_var}; std::vector outputs; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t kChannels = benchmark_state.range(2); @@ -165,7 +165,7 @@ static void setupInstanceNormNHWC(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_InstanceNormNHWC( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -186,7 +186,7 @@ static void NvFuserScheduler_InstanceNormNHWC( std::vector aten_inputs = {at_x, at_weight, at_bias}; std::vector outputs; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t kChannels = benchmark_state.range(2); diff --git a/benchmarks/cpp/layer_norm.cpp b/benchmarks/cpp/layer_norm.cpp index 445b1637274..706f1e8fa84 100644 --- a/benchmarks/cpp/layer_norm.cpp +++ b/benchmarks/cpp/layer_norm.cpp @@ -67,7 +67,7 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_LayerNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == 
DataType::Float || dtype == DataType::Half); @@ -84,7 +84,7 @@ static void NvFuserScheduler_LayerNorm( std::vector aten_inputs({input, weight, bias}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -142,7 +142,7 @@ static void Baseline_LayerNorm_fp16(benchmark::State& benchmark_state) { static void NvFuserScheduler_TIMM_LayerNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -162,7 +162,7 @@ static void NvFuserScheduler_TIMM_LayerNorm( std::vector aten_inputs({input, weight, bias}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/layer_norm_backward.cpp b/benchmarks/cpp/layer_norm_backward.cpp index a14e89fce4f..3da9de991dc 100644 --- a/benchmarks/cpp/layer_norm_backward.cpp +++ b/benchmarks/cpp/layer_norm_backward.cpp @@ -80,7 +80,7 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_LayerNorm_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -103,7 +103,7 @@ static void NvFuserScheduler_LayerNorm_BWD( std::vector aten_inputs( {grad_out, input, weight, bias, mean, rstd}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/layer_norm_fused.cpp b/benchmarks/cpp/layer_norm_fused.cpp index 823b571aa1c..12cba780d4b 100644 --- a/benchmarks/cpp/layer_norm_fused.cpp +++ b/benchmarks/cpp/layer_norm_fused.cpp @@ -84,7 +84,7 @@ static void setupLayerNormFused(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_LayerNormFused( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR(dtype == DataType::Half); @@ -104,7 +104,7 @@ static void NvFuserScheduler_LayerNormFused( std::vector aten_inputs({tv0, tv1, tv2, tv3, tv4}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/lstm_cell.cpp b/benchmarks/cpp/lstm_cell.cpp index 3c7b98a4c84..7fe205f6312 100644 --- a/benchmarks/cpp/lstm_cell.cpp +++ b/benchmarks/cpp/lstm_cell.cpp @@ -155,8 +155,8 @@ static void NvFuserScheduler_LstmCell_Compile( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); for (auto _ : benchmark_state) { - FusionExecutor executor; - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); } } @@ -182,14 +182,14 @@ static void NvFuserScheduler_LstmCell_RunFusion( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs); + 
KernelExecutor ke; + ke.compile(&fusion, inputs); C10_CUDA_CHECK(cudaDeviceSynchronize()); for (auto _ : benchmark_state) { - outputs = executor.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); C10_CUDA_CHECK(cudaDeviceSynchronize()); } } @@ -220,11 +220,11 @@ static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); runBenchmarkIterations( - benchmark_state, &executor, inputs, heuristic_params->lparams); + benchmark_state, &ke, inputs, heuristic_params->lparams); } BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion_GpuOnly, Small, 512, 64) @@ -259,13 +259,13 @@ static void NvFuserScheduler_LstmCell_RunFusion_CpuOnly( auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, c10::ArrayRef(inputs)); - FusionExecutor executor; - executor.setExecuteKernelFlag(false); - executor.compileFusion(&fusion, inputs); + KernelExecutor ke; + ke.setExecuteKernelFlag(false); + ke.compile(&fusion, inputs); for (auto _ : benchmark_state) { - outputs = executor.runFusion( - c10::ArrayRef(inputs), heuristic_params->lparams); + outputs = + ke.run(c10::ArrayRef(inputs), heuristic_params->lparams); } } diff --git a/benchmarks/cpp/matmul.cpp b/benchmarks/cpp/matmul.cpp index 4f93dfbaf62..a9b10655aa0 100644 --- a/benchmarks/cpp/matmul.cpp +++ b/benchmarks/cpp/matmul.cpp @@ -175,19 +175,19 @@ static void SingleMatmulBase( // Compile kernel auto launch_constraints = LaunchParams(); - FusionExecutor fe; - fe.compileFusion(fusion, args, launch_constraints, cparams); + KernelExecutor ke; + ke.compile(fusion, args, launch_constraints, cparams); NVF_CHECK( - getBankConflictInfo(fe.kernel(), launch_constraints).empty(), + getBankConflictInfo(ke.kernel(), launch_constraints).empty(), "Shared memory bank conflict not removed."); std::vector aten_inputs({inputs.first, inputs.second}); // Warm up run - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); checkMatch(expected_output, outputs.at(0).to(at::kDouble), k); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); // TODO: FLOPS calculation } @@ -355,19 +355,19 @@ static void SingleMatmulPartitionedK( cparams.index_type = computeIndexType(M, N, K); // Compile kernel - FusionExecutor fe; + KernelExecutor ke; auto lparams = LaunchParams(); - fe.compileFusion(fusion, args, lparams, cparams); + ke.compile(fusion, args, lparams, cparams); NVF_CHECK( - getBankConflictInfo(fe.kernel(), lparams).empty(), + getBankConflictInfo(ke.kernel(), lparams).empty(), "Shared memory bank conflict not removed."); // Warm up run - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); checkMatch(expected_output, outputs.at(0).to(at::kDouble), Ki); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); // TODO: FLOPS calculation } @@ -461,21 +461,21 @@ static void NvFuserScheduler_MatmulSplitKReduction( KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); // Compile kernel - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( fusion, args, heuristic_params->lparams, heuristic_params->cparams); NVF_CHECK( - 
getBankConflictInfo(fe.kernel(), heuristic_params->lparams).empty(), + getBankConflictInfo(ke.kernel(), heuristic_params->lparams).empty(), "Shared memory bank conflict not removed."); // Warm up run - auto outputs = fe.runFusion(aten_inputs, heuristic_params->lparams); + auto outputs = ke.run(aten_inputs, heuristic_params->lparams); checkMatch(expected_output, outputs.at(0).to(at::kDouble), splitk_factor); runBenchmarkIterations( - benchmark_state, &fe, aten_inputs, heuristic_params->lparams); + benchmark_state, &ke, aten_inputs, heuristic_params->lparams); // TODO: FLOPS calculation } diff --git a/benchmarks/cpp/reduction.cpp b/benchmarks/cpp/reduction.cpp index f70fb931e84..84f378967ca 100644 --- a/benchmarks/cpp/reduction.cpp +++ b/benchmarks/cpp/reduction.cpp @@ -50,7 +50,7 @@ static void setupReduction(Fusion* fusion, DataType dtype, int red_axis) { static void NvFuserScheduler_Reduction( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int reduction_dim) { auto reduction_size = benchmark_state.range(0); @@ -65,7 +65,7 @@ static void NvFuserScheduler_Reduction( std::vector aten_inputs({aten_input}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/rms_norm.cpp b/benchmarks/cpp/rms_norm.cpp index 1c29c66a631..6085929c179 100644 --- a/benchmarks/cpp/rms_norm.cpp +++ b/benchmarks/cpp/rms_norm.cpp @@ -62,7 +62,7 @@ static void setupRMSNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_RMSNorm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR( dtype == DataType::Float || dtype == DataType::Half || @@ -80,7 +80,7 @@ static void NvFuserScheduler_RMSNorm( std::vector aten_inputs({input, weight}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/rms_norm_backward.cpp b/benchmarks/cpp/rms_norm_backward.cpp index 357499f2be4..c9422846b19 100644 --- a/benchmarks/cpp/rms_norm_backward.cpp +++ b/benchmarks/cpp/rms_norm_backward.cpp @@ -69,7 +69,7 @@ static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_RMSNorm_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { NVF_ERROR( dtype == DataType::Float || dtype == DataType::Half || @@ -89,7 +89,7 @@ static void NvFuserScheduler_RMSNorm_BWD( std::vector aten_inputs({grad_out, input, weight, rstd}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/scale_bias_relu.cpp b/benchmarks/cpp/scale_bias_relu.cpp index e68c0b9140d..ed1505a2884 100644 --- a/benchmarks/cpp/scale_bias_relu.cpp +++ b/benchmarks/cpp/scale_bias_relu.cpp @@ -114,7 +114,7 @@ static void setupSBRNorm(Fusion* fusion, DataType dtype) { static void NvFuserScheduler_SBR( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + 
FusionExecutorCache* executor_cache, DataType dtype) { // N, H, W, C format std::vector input_shape{ @@ -136,7 +136,7 @@ static void NvFuserScheduler_SBR( // inputs std::vector aten_inputs = {at_x, at_scale, at_bias}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t size = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; @@ -191,7 +191,7 @@ static void Baseline_SBR(benchmark::State& benchmark_state, DataType dtype) { static void NvFuserScheduler_SBR_Norm( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype) { // N, H, W, C format std::vector input_shape{ @@ -215,7 +215,7 @@ static void NvFuserScheduler_SBR_Norm( std::vector aten_inputs = { at_x, at_weight, at_bias, at_mean, at_var}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); const size_t size = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; diff --git a/benchmarks/cpp/shape_inference.cpp b/benchmarks/cpp/shape_inference.cpp index 801759e2c03..3e580b4e6b4 100644 --- a/benchmarks/cpp/shape_inference.cpp +++ b/benchmarks/cpp/shape_inference.cpp @@ -26,7 +26,7 @@ using namespace nvfuser; static auto getLayerBackwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -86,12 +86,12 @@ static auto getLayerBackwardNormRuntime( auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } void LayerNormBackward_ShapeInference_Base( @@ -101,30 +101,30 @@ void LayerNormBackward_ShapeInference_Base( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerBackwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); NVF_ERROR(runtime->getMaybeHeuristicsFor(args).has_value()); - fec->profile(true); - fec->disableKernelLaunch(); - fec->runFusionWithInputs(aten_inputs); + executor_cache->profile(true); + executor_cache->disableKernelLaunch(); + executor_cache->runFusionWithInputs(aten_inputs); if (disable_launch_parameter_cache) { - fec->disableLaunchParamCache(); + executor_cache->disableLaunchParamCache(); } for (auto _ : benchmark_state) { // Setup (not included in the measurement) - fec->runFusionWithInputs(aten_inputs); + executor_cache->runFusionWithInputs(aten_inputs); } } @@ -140,7 +140,7 @@ static void NvFuserScheduler_LayerNormBackward_NoShapeInferenceCachedBaseline( static auto getLayerForwardNormRuntime( std::unique_ptr fusion_ptr, - std::unique_ptr& fec, + std::unique_ptr& executor_cache, 
std::vector& aten_inputs, std::vector& shape, std::vector& norm_shape) { @@ -161,11 +161,11 @@ static auto getLayerForwardNormRuntime( auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn(shape, options); - fec = std::make_unique(std::move(fusion_ptr)); + executor_cache = std::make_unique(std::move(fusion_ptr)); aten_inputs = {aten_input}; - auto cg_outputs = fec->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); - return fec->getMostRecentKernelRuntime(); + return executor_cache->getMostRecentKernelRuntime(); } void LayerNormForward_ShapeInferenceBase( @@ -175,31 +175,31 @@ void LayerNormForward_ShapeInferenceBase( FusionGuard fg(fusion_ptr.get()); // PreAllocate - std::unique_ptr fec; + std::unique_ptr executor_cache; std::vector aten_inputs; std::vector shape{20, 100, 35, 67}; std::vector norm_shape{67}; auto runtime = getLayerForwardNormRuntime( - std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape); + std::move(fusion_ptr), executor_cache, aten_inputs, shape, norm_shape); KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); NVF_ERROR(runtime->getMaybeHeuristicsFor(args).has_value()); - fec->profile(true); - fec->disableKernelLaunch(); - fec->runFusionWithInputs(aten_inputs); + executor_cache->profile(true); + executor_cache->disableKernelLaunch(); + executor_cache->runFusionWithInputs(aten_inputs); if (disable_launch_param_cache) { - fec->disableLaunchParamCache(); + executor_cache->disableLaunchParamCache(); } for (auto _ : benchmark_state) { // Setup (not included in the measurement) - fec->runFusionWithInputs(aten_inputs); + executor_cache->runFusionWithInputs(aten_inputs); } } diff --git a/benchmarks/cpp/softmax.cpp b/benchmarks/cpp/softmax.cpp index f1dca672349..90b30c9ff54 100644 --- a/benchmarks/cpp/softmax.cpp +++ b/benchmarks/cpp/softmax.cpp @@ -52,7 +52,7 @@ static void setupSoftmax( static void NvFuserScheduler_Softmax( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, const int reduction_axis) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -70,7 +70,7 @@ static void NvFuserScheduler_Softmax( std::vector aten_inputs({aten_input}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -105,10 +105,10 @@ static void NvFuserScheduler_Softmax_WarpReduceReference( auto heuristic_params = scheduler->computeHeuristics(fusion, runtime_info); scheduler->schedule(fusion, heuristic_params.get()); - FusionExecutor fe; - fe.compileFusion(fusion, aten_inputs); + KernelExecutor ke; + ke.compile(fusion, aten_inputs); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * @@ -152,10 +152,10 @@ static void NvFuserScheduler_Softmax_WarpReduce( } } - FusionExecutor fe; - fe.compileFusion(fusion, aten_inputs); + KernelExecutor ke; + ke.compile(fusion, aten_inputs); - runBenchmarkIterations(benchmark_state, &fe, aten_inputs); + runBenchmarkIterations(benchmark_state, &ke, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/softmax_backward.cpp 
b/benchmarks/cpp/softmax_backward.cpp index 8c4a84562cc..364a5246016 100644 --- a/benchmarks/cpp/softmax_backward.cpp +++ b/benchmarks/cpp/softmax_backward.cpp @@ -57,7 +57,7 @@ static void setupSoftmaxBWD( static void NvFuserScheduler_Softmax_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, const int reduction_axis) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -83,7 +83,7 @@ static void NvFuserScheduler_Softmax_BWD( std::vector aten_inputs({grad_output, output, input}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/softmax_dropout.cpp b/benchmarks/cpp/softmax_dropout.cpp index f43fa24da81..2999d6442f1 100644 --- a/benchmarks/cpp/softmax_dropout.cpp +++ b/benchmarks/cpp/softmax_dropout.cpp @@ -75,7 +75,7 @@ static void setupSoftmaxDropout( static void NvFuserScheduler_SoftmaxDropout( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, const int kReductionAxis) { NVF_ERROR(dtype == DataType::Float || dtype == DataType::Half); @@ -96,7 +96,7 @@ static void NvFuserScheduler_SoftmaxDropout( std::vector aten_inputs( {at_scores, at_mask, sqrt(kAttentionHeadSize)}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // 5 dtype: attention_scores + attention_mask + attention_scores_out + // attention_probs_out + output diff --git a/benchmarks/cpp/timm.cpp b/benchmarks/cpp/timm.cpp index 8bffc0bd1ef..cac01eadcca 100644 --- a/benchmarks/cpp/timm.cpp +++ b/benchmarks/cpp/timm.cpp @@ -56,7 +56,7 @@ static void setup_vit_base_patch16_224_bcast7(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -74,7 +74,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7( auto t7 = at::randn(input_shape, fp16_options); std::vector aten_inputs({t2, t3, t4, t7}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensor - float + halfx2 - t2, t7, t39 // Inner most dimension only - floatx2 - t36, t37 @@ -170,7 +170,7 @@ static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -189,7 +189,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5( auto t1 = at::randn({input_shape[2]}, fp32_options); std::vector aten_inputs({t2, t5, t3, t0, t1}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensor - floatx2, halfx2, bool - t2, t16, t3, t34, t16 // Inner most dim only - floatx5 - t5, t0, t1, t7, t17 @@ -236,7 +236,7 @@ static void setup_vit_base_patch16_224_bcast_outer2( static void 
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -252,7 +252,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2( auto t2 = at::randn({input_shape[2]}, fp32_options); std::vector aten_inputs({t0, t2}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensor - halfx2 - t0, t6 // inner dimension only - halfx2 - t2, t7 @@ -314,7 +314,7 @@ static void setup_vit_base_patch16_224_norm_inner3(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -328,7 +328,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3( auto t0 = at::randn(input_shape, fp16_options); std::vector aten_inputs({t0, 0.125}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensors - floatx2, half x2, bool - t12, t4, t0, t19, t14 benchmark_state.SetBytesProcessed( @@ -391,7 +391,7 @@ static void setup_vit_base_patch16_224_bcast_outer6( static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -407,7 +407,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6( auto t2 = at::randn({input_shape[2]}, fp32_options); std::vector aten_inputs({t0, t2}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensors - float, halfx2, bool - t6, t0, t18, t13 // inner dimension only - float, half - t2, t19 benchmark_state.SetBytesProcessed( @@ -480,7 +480,7 @@ static void setup_vit_base_patch16_224_bcast_inner6( static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -496,7 +496,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6( auto t2 = at::randn({input_shape[0], input_shape[1]}, fp32_options); std::vector aten_inputs({t0, t2}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // full tensors - float, halfx2, bool - t6, t0, t18, t13 // outer two dimensions only - float, half - t2, t19 @@ -620,7 +620,7 @@ static void setup_vit_base_patch16_224_LN_BWD(Fusion* fusion, void* null) { static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -641,7 +641,7 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD( auto t9 = at::randn({input_shape[2]}, fp16_options); std::vector aten_inputs({t0, t1, t3, t5, t6, t7, t9, 1.0}); - 
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensors - bool, halfx4 - t0, t1, t3, t34, t35 // Outer two dimensions - floatx2 - t5, t6 @@ -701,7 +701,7 @@ static void nhwc_seresnet152d_transpose65(Fusion* fusion, void* null) { static void NvFuserScheduler_nhwc_seresnet152d_transpose65( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, void* null) { std::vector input_shape{ benchmark_state.range(0), @@ -721,7 +721,7 @@ static void NvFuserScheduler_nhwc_seresnet152d_transpose65( auto t4 = at::randn({2}, fp16_options).sum(); std::vector aten_inputs({t2, t5, t7, t9, t4}); - runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs); // Full tensors - halfx6 - t2, t5, t7, t9, t29, t30 benchmark_state.SetBytesProcessed( diff --git a/benchmarks/cpp/transpose.cpp b/benchmarks/cpp/transpose.cpp index b24a2fdbfbe..21389f49f36 100644 --- a/benchmarks/cpp/transpose.cpp +++ b/benchmarks/cpp/transpose.cpp @@ -108,7 +108,7 @@ static void setupTranspose( static void NvFuserScheduler_Transpose( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, DataType dtype, int num_dims, std::pair axes, @@ -125,7 +125,7 @@ static void NvFuserScheduler_Transpose( auto at_input2 = aten_inputs[1]; std::vector fuser_inputs = {at_input1, at_input2}; - runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs); + runBenchmarkIterations(benchmark_state, executor_cache, fuser_inputs); benchmark_state.SetBytesProcessed( int64_t(benchmark_state.iterations()) * diff --git a/benchmarks/cpp/utils.cpp b/benchmarks/cpp/utils.cpp index e171badd9ae..613a3cbb2ef 100644 --- a/benchmarks/cpp/utils.cpp +++ b/benchmarks/cpp/utils.cpp @@ -170,27 +170,27 @@ int64_t getSizeOfOutputs(const std::vector& outputs) { int64_t runBenchmarkIterations( benchmark::State& benchmark_state, - FusionExecutorCache* fusion_executor_cache, + FusionExecutorCache* executor_cache, std::vector& aten_inputs) { c10::cuda::CUDACachingAllocator::emptyCache(); - fusion_executor_cache->profile(true); + executor_cache->profile(true); int64_t io_bytes = getSizeOfInputs(aten_inputs); // Segment and compile the fusion { - auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs); io_bytes += getSizeOfOutputs(cg_outputs); } bool segmented = - fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented() && - fusion_executor_cache->getMostRecentKernelRuntime() + executor_cache->getMostRecentKernelRuntime()->isSegmented() && + executor_cache->getMostRecentKernelRuntime() ->fusionSegments() ->groups() .size() > 1; - const auto& compile_log = fusion_executor_cache->getMostRecentExecutorInfo(); + const auto& compile_log = executor_cache->getMostRecentExecutorInfo(); auto params = toString(compile_log.params); auto lparams = toString(compile_log.fusion_executor->lastLaunchParams()); // Only set if not segmented. 
In the case of segmented fusions,
@@ -200,7 +200,7 @@ int64_t runBenchmarkIterations(
     benchmark_state.SetLabel(params + lparams);
   }
 
-  fusion_executor_cache->profile(false);
+  executor_cache->profile(false);
 
   // Sync everything up before we start
   NVFUSER_CUDA_RT_SAFE_CALL(cudaDeviceSynchronize());
@@ -208,7 +208,7 @@ int64_t runBenchmarkIterations(
   for (auto _ : benchmark_state) {
     clearL2Cache();
-    auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
+    auto cg_outputs = executor_cache->runFusionWithInputs(aten_inputs);
     benchmark_state.SetIterationTime(
         FusionProfiler::profile().kernel_time_ms / 1000.0);
   }
@@ -223,15 +223,15 @@ int64_t runBenchmarkIterations(
 
 int64_t runBenchmarkIterations(
     benchmark::State& benchmark_state,
-    FusionExecutor* fusion_executor,
+    KernelExecutor* fusion_executor,
     std::vector<c10::IValue>& aten_inputs,
     const LaunchParams& launch_constraints,
     CompileParams compile_params) {
   int64_t io_bytes = getSizeOfInputs(aten_inputs);
   { // Warm-up run
-    auto cg_outputs = fusion_executor->runFusion(
-        aten_inputs, launch_constraints, compile_params);
+    auto cg_outputs =
+        fusion_executor->run(aten_inputs, launch_constraints, compile_params);
     io_bytes += getSizeOfOutputs(cg_outputs);
   }
@@ -246,8 +246,8 @@ int64_t runBenchmarkIterations(
     clearL2Cache();
     FusionProfiler::start();
     FusionProfiler::createSegments(1);
-    auto cg_outputs = fusion_executor->runFusion(
-        aten_inputs, launch_constraints, compile_params);
+    auto cg_outputs =
+        fusion_executor->run(aten_inputs, launch_constraints, compile_params);
     FusionProfiler::stop();
     benchmark_state.SetIterationTime(
         FusionProfiler::profile().kernel_time_ms / 1000.0);
diff --git a/benchmarks/cpp/utils.h b/benchmarks/cpp/utils.h
index 61c5e556af3..67beb1ca7d5 100644
--- a/benchmarks/cpp/utils.h
+++ b/benchmarks/cpp/utils.h
@@ -40,7 +40,7 @@ std::string toString(LaunchParams lparams);
 //! if not segmented.
 int64_t runBenchmarkIterations(
     benchmark::State& benchmark_state,
-    FusionExecutorCache* fusion_executor_cache,
+    FusionExecutorCache* executor_cache,
     std::vector<c10::IValue>& aten_inputs);
 
 //! Run benchmark iterations with a fusion executor and
@@ -48,7 +48,7 @@ int64_t runBenchmarkIterations(
 //! kernel time is added to benchmark_state.
 int64_t runBenchmarkIterations(
     benchmark::State& benchmark_state,
-    FusionExecutor* fusion_executor,
+    KernelExecutor* fusion_executor,
     std::vector<c10::IValue>& aten_inputs,
     const LaunchParams& launch_constraints = LaunchParams(),
     CompileParams compile_params = CompileParams());
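Both runBenchmarkIterations overloads in benchmarks/cpp/utils.{cpp,h} above report kernel time rather than wall time: each iteration calls benchmark_state.SetIterationTime(kernel_time_ms / 1000.0), which Google Benchmark only honors for benchmarks registered with UseManualTime(). A minimal registration sketch; the registration sites themselves are not part of this diff, and the names are illustrative:

    BENCHMARK_CAPTURE(NvFuserScheduler_Example, Baseline, DataType::Float)
        ->RangeMultiplier(2)
        ->Ranges({{1024, 1024}})
        ->UseManualTime()                  // use SetIterationTime() values
        ->Unit(benchmark::kMillisecond);
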
""" - run_eager = config.getoption("--benchmark-eager") - run_thunder = config.getoption("--benchmark-thunder") - run_torchcompile = config.getoption("--benchmark-torchcompile") from nvfuser.pytorch_utils import retry_on_oom_or_skip_test + executors = ["eager", "torchcompile", "thunder"] + + def get_test_executor(item) -> str | None: + if hasattr(item, "callspec") and "executor" in item.callspec.params: + test_executor = item.callspec.params["executor"] + assert ( + test_executor in executors + ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}." + return test_executor + return None + + executors_to_skip = [] + + for executor in executors: + if not config.getoption(f"--benchmark-{executor}"): + executors_to_skip.append(executor) + for item in items: item.obj = retry_on_oom_or_skip_test(item.obj) - if not run_eager: - skip_eager = pytest.mark.skip(reason="need --benchmark-eager option to run") - for item in items: - # If the benchmark has compile=False parameter (eager mode), skip it. - if ( - hasattr(item, "callspec") - and "compile" in item.callspec.params - and not item.callspec.params["compile"] - ): - item.add_marker(skip_eager) - - if not run_torchcompile: - skip_torchcompile = pytest.mark.skip( - reason="need --benchmark-torchcompile option to run" - ) - for item in items: - # If the benchmark has compile=True parameter (torch.compile mode), skip it. - if ( - hasattr(item, "callspec") - and "compile" in item.callspec.params - and item.callspec.params["compile"] - ): - item.add_marker(skip_torchcompile) - - if not run_thunder: - skip_thunder = pytest.mark.skip(reason="need --benchmark-thunder option to run") - for item in items: - if "thunder" in item.nodeid: - item.add_marker(skip_thunder) + test_executor = get_test_executor(item) + + if test_executor is not None and test_executor in executors_to_skip: + item.add_marker( + pytest.mark.skip( + reason=f"need --benchmark-{test_executor} option to run." 
+ ) + ) diff --git a/benchmarks/python/normalization.py b/benchmarks/python/normalization.py index 8cbafe81353..6d493338846 100644 --- a/benchmarks/python/normalization.py +++ b/benchmarks/python/normalization.py @@ -433,10 +433,10 @@ def norm_fwd_baseline_benchmark( size: tuple, dtype: torch.dtype, channels_last: bool, - compile: bool, + executor: str, norm: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() assert norm in ["batch_norm", "instance_norm"], NotImplementedError @@ -453,10 +453,12 @@ def norm_fwd_baseline_benchmark( norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn + benchmark_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)} + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(norm_fwd_fn) if compile else norm_fwd_fn, + benchmark_fn[executor], [inputs, weight, bias, running_mean, running_var], iobytes=norm_fwd_iobytes(size, dtype, norm), ) @@ -467,10 +469,10 @@ def norm_bwd_baseline_benchmark( size: tuple, dtype: torch.dtype, channels_last: bool, - compile: bool, + executor: str, norm: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() assert norm in ["batch_norm", "instance_norm"], NotImplementedError @@ -491,13 +493,13 @@ def norm_bwd_baseline_benchmark( norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn # Compile the fwd fn for torchcompile - norm_fwd_fn = torch.compile(norm_fwd_fn) if compile else norm_fwd_fn - output = norm_fwd_fn([inputs, weight, bias, running_mean, running_var]) + fwd_fn = {"eager": norm_fwd_fn, "torchcompile": torch.compile(norm_fwd_fn)} + outputs = fwd_fn[executor]([inputs, weight, bias, running_mean, running_var]) # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=norm_bwd_iobytes(size, dtype, norm), ) diff --git a/benchmarks/python/test_batchnorm_bwd.py b/benchmarks/python/test_batchnorm_bwd.py index 74242ba99e2..0a1cd64cc57 100644 --- a/benchmarks/python/test_batchnorm_bwd.py +++ b/benchmarks/python/test_batchnorm_bwd.py @@ -31,13 +31,13 @@ def test_batchnorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_batchnorm_bwd_baseline_benchmark( - benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_bwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "batch_norm" + benchmark, size, dtype, channels_last, executor, "batch_norm" ) diff --git a/benchmarks/python/test_batchnorm_fwd.py b/benchmarks/python/test_batchnorm_fwd.py index 47b3997770a..af197ce6f1b 100644 --- a/benchmarks/python/test_batchnorm_fwd.py +++ b/benchmarks/python/test_batchnorm_fwd.py @@ -31,13 +31,13 @@ def test_batchnorm_fwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_batchnorm_fwd_baseline_benchmark( - benchmark, size: tuple, dtype: 
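The normalization helpers above establish the mechanical rewrite repeated in every benchmark file that follows: the `torch.compile(fn) if compile else fn` ternary becomes an executor-keyed table, built once and indexed with the `executor` string. In isolation, with an invented forward function (a sketch, not repo code; a CUDA device is assumed):

    import torch

    def fwd_fn(x: torch.Tensor) -> torch.Tensor:
        # Illustrative stand-in for the benchmarked forward functions.
        return torch.nn.functional.gelu(x, approximate="tanh")

    benchmark_fn = {
        "eager": fwd_fn,
        "torchcompile": torch.compile(fwd_fn),
    }

    executor = "torchcompile"  # supplied by @pytest.mark.parametrize
    out = benchmark_fn[executor](torch.randn(1024, 1024, device="cuda"))

Building the table calls torch.compile even when the eager entry is the one selected; since torch.compile defers actual compilation to the first call, the unused entry costs essentially nothing.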
torch.dtype, channels_last: bool, compile: bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_fwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "batch_norm" + benchmark, size, dtype, channels_last, executor, "batch_norm" ) diff --git a/benchmarks/python/test_broadcast_add_fwd.py b/benchmarks/python/test_broadcast_add_fwd.py index abb320ef2a3..65db1555b28 100644 --- a/benchmarks/python/test_broadcast_add_fwd.py +++ b/benchmarks/python/test_broadcast_add_fwd.py @@ -88,7 +88,7 @@ def test_bcast_add_nvf_benchmark( run_benchmark(benchmark, fd.execute, [bias, x]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("bcast_axis", [0, 1], ids=["outer", "inner"]) @@ -101,9 +101,9 @@ def test_bcast_add_baseline_benchmark( dtype: torch.dtype, bcast_axis: int, contiguous: bool, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() bias = torch.randn(size[1 - bcast_axis], dtype=dtype, device="cuda") input_shape = size if contiguous else (size[1], size[0]) @@ -112,9 +112,14 @@ def test_bcast_add_baseline_benchmark( x = x.t() assert x.is_contiguous() == contiguous + benchmark_fn = { + "eager": bcast_add_fwd_fn, + "torchcompile": torch.compile(bcast_add_fwd_fn), + } + # Inputs and outputs are same as nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(bcast_add_fwd_fn) if compile else bcast_add_fwd_fn, + benchmark_fn[executor], [bias, x, bcast_axis], ) diff --git a/benchmarks/python/test_dropout_layernorm_bwd.py b/benchmarks/python/test_dropout_layernorm_bwd.py index 6acaa012c5c..380a2085b09 100644 --- a/benchmarks/python/test_dropout_layernorm_bwd.py +++ b/benchmarks/python/test_dropout_layernorm_bwd.py @@ -189,16 +189,16 @@ def test_dropout_layernorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_layernorm_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 @@ -217,13 +217,16 @@ def dropout_layernorm_fwd(): ) # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd - output = fwd_fn() + fwd_fn = { + "eager": dropout_layernorm_fwd, + "torchcompile": torch.compile(dropout_layernorm_fwd), + } + outputs = fwd_fn[executor]() # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=dropout_layernorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_dropout_layernorm_fwd.py b/benchmarks/python/test_dropout_layernorm_fwd.py index 47854fcd2d7..4408a2bd611 100644 --- a/benchmarks/python/test_dropout_layernorm_fwd.py +++ b/benchmarks/python/test_dropout_layernorm_fwd.py @@ -160,16 +160,16 @@ def test_dropout_layernorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", 
"torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_layernorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 @@ -181,10 +181,15 @@ def test_dropout_layernorm_fwd_baseline_benchmark( dropout_p, ] + benchmark_fn = { + "eager": dropout_layernorm_fwd, + "torchcompile": torch.compile(dropout_layernorm_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(dropout_layernorm_fwd) if compile else dropout_layernorm_fwd, + benchmark_fn[executor], inputs, iobytes=dropout_layernorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_dropout_rmsnorm_bwd.py b/benchmarks/python/test_dropout_rmsnorm_bwd.py index 8c61c51e2d9..d196e76f57b 100644 --- a/benchmarks/python/test_dropout_rmsnorm_bwd.py +++ b/benchmarks/python/test_dropout_rmsnorm_bwd.py @@ -169,16 +169,16 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_rmsnorm_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 input1 = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) @@ -191,12 +191,15 @@ def dropout_rmsnorm_fwd(): output = weights * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5) return output - fwd_fn = torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd - output = fwd_fn() + fwd_fn = { + "eager": dropout_rmsnorm_fwd, + "torchcompile": torch.compile(dropout_rmsnorm_fwd), + } + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=dropout_rmsnorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_dropout_rmsnorm_fwd.py b/benchmarks/python/test_dropout_rmsnorm_fwd.py index a93a8caf547..aea2674df9d 100644 --- a/benchmarks/python/test_dropout_rmsnorm_fwd.py +++ b/benchmarks/python/test_dropout_rmsnorm_fwd.py @@ -145,16 +145,16 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [input1, input2, weights]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_dropout_rmsnorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() dropout_p = 0.2 @@ -165,10 +165,15 @@ def test_dropout_rmsnorm_fwd_baseline_benchmark( dropout_p, ] + benchmark_fn = { + "eager": dropout_rmsnorm_fwd, + "torchcompile": torch.compile(dropout_rmsnorm_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(dropout_rmsnorm_fwd) if compile else dropout_rmsnorm_fwd, + benchmark_fn[executor], inputs, iobytes=dropout_rmsnorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_gelu_bwd.py b/benchmarks/python/test_gelu_bwd.py index 648f0317cf9..ffd0b25c6a2 100644 --- 
a/benchmarks/python/test_gelu_bwd.py +++ b/benchmarks/python/test_gelu_bwd.py @@ -88,16 +88,16 @@ def test_gelu_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, grads, bias]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_gelu_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) bias = torch.ones(size[-1], device="cuda", dtype=dtype) @@ -106,12 +106,15 @@ def test_gelu_bwd_baseline_benchmark( def gelu_fwd(): return torch.nn.functional.gelu(inputs + bias, approximate="tanh") - fwd_fn = torch.compile(gelu_fwd) if compile else gelu_fwd - eager_output = fwd_fn() + fwd_fn = { + "eager": gelu_fwd, + "torchcompile": torch.compile(gelu_fwd), + } + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [eager_output, grads], + [outputs, grads], iobytes=gelu_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_gelu_bwd_reduction.py b/benchmarks/python/test_gelu_bwd_reduction.py index 09dfd53d88a..e860826eb49 100644 --- a/benchmarks/python/test_gelu_bwd_reduction.py +++ b/benchmarks/python/test_gelu_bwd_reduction.py @@ -103,7 +103,7 @@ def test_gelu_bwd_reduction_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, grads, bias]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0, 1]) @@ -112,19 +112,23 @@ def test_gelu_bwd_reduction_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) bias = torch.ones(size[-1], device="cuda", dtype=dtype) grads = torch.randn(size, device="cuda", dtype=dtype) eager_output = torch.nn.functional.gelu(inputs + bias, approximate="tanh") + + benchmark_fn = { + "eager": gelu_bwd_reduction_torch, + "torchcompile": torch.compile(gelu_bwd_reduction_torch), + } + run_benchmark( benchmark, - torch.compile(gelu_bwd_reduction_torch) - if compile - else gelu_bwd_reduction_torch, + benchmark_fn[executor], [eager_output, grads, inputs, reduction_axis], iobytes=gelu_bwd_reduction_iobytes(size, dtype, reduction_axis), ) diff --git a/benchmarks/python/test_gelu_fwd.py b/benchmarks/python/test_gelu_fwd.py index fa5f891ef8a..2f208b2c090 100644 --- a/benchmarks/python/test_gelu_fwd.py +++ b/benchmarks/python/test_gelu_fwd.py @@ -67,22 +67,26 @@ def test_gelu_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_gelu_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = [ torch.randn(size, 
device="cuda", dtype=dtype, requires_grad=True), # in_tensor torch.ones(size[-1], device="cuda", dtype=dtype), # bias ] + + benchmark_fn = { + "eager": gelu_fwd_fn, + "torchcompile": torch.compile(gelu_fwd_fn), + } + # Inputs and outputs are same as nvFuser, no need for manual IOByte computation - run_benchmark( - benchmark, torch.compile(gelu_fwd_fn) if compile else gelu_fwd_fn, inputs - ) + run_benchmark(benchmark, benchmark_fn[executor], inputs) diff --git a/benchmarks/python/test_groupnorm_fwd.py b/benchmarks/python/test_groupnorm_fwd.py index af4c023d7d7..8c729e115d7 100644 --- a/benchmarks/python/test_groupnorm_fwd.py +++ b/benchmarks/python/test_groupnorm_fwd.py @@ -128,35 +128,16 @@ def test_groupnorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [x, weight, bias]) -@pytest.mark.parametrize("size", generate_input_sizes(dims=4)) -@pytest.mark.parametrize("dtype", FLOAT_DTYPES) -def test_groupnorm_fwd_thunder_benchmark( - benchmark, - size: tuple, - dtype: torch.dtype, -): - N, C, H, W = size - x = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) - weight = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True) - bias = torch.randn(C, device="cuda", dtype=dtype, requires_grad=True) - num_groups = get_n_groups(C) - # thunder compiled model - groupnorm_fwd_jit = thunder.jit( - groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex] - ) - run_benchmark(benchmark, groupnorm_fwd_jit, [x, weight, bias, num_groups]) - - -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_groupnorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() N, C, H, W = size x = torch.randn(size, device="cuda", dtype=dtype) @@ -164,8 +145,15 @@ def test_groupnorm_fwd_baseline_benchmark( bias = torch.randn(C, device="cuda", dtype=dtype) num_groups = get_n_groups(C) + benchmark_fn = { + "eager": groupnorm_fwd, + "torchcompile": torch.compile(groupnorm_fwd), + "thunder": thunder.jit( + groupnorm_fwd, nv_enable_bookend=False, executors=[nvfuserex] + ), + } run_benchmark( benchmark, - torch.compile(groupnorm_fwd) if compile else groupnorm_fwd, + benchmark_fn[executor], [x, weight, bias, num_groups], ) diff --git a/benchmarks/python/test_huggingface_attn_bwd.py b/benchmarks/python/test_huggingface_attn_bwd.py index dd8c9f80114..bcb2b4d9268 100644 --- a/benchmarks/python/test_huggingface_attn_bwd.py +++ b/benchmarks/python/test_huggingface_attn_bwd.py @@ -107,16 +107,16 @@ def test_huggingface_attn_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_huggingface_attn_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, seq_len, nh, n_embd = size dropout_p = 0.2 @@ -134,14 +134,17 @@ def huggingface_attn_fwd(): return output # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(huggingface_attn_fwd) if compile else 
huggingface_attn_fwd - output = fwd_fn() + fwd_fn = { + "eager": huggingface_attn_fwd, + "torchcompile": torch.compile(huggingface_attn_fwd), + } + outputs = fwd_fn[executor]() grads = torch.randn(batch_size * nh, seq_len, seq_len, device="cuda", dtype=dtype) # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=huggingface_attn_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_huggingface_attn_fwd.py b/benchmarks/python/test_huggingface_attn_fwd.py index 27a013a8481..714a12e41d1 100644 --- a/benchmarks/python/test_huggingface_attn_fwd.py +++ b/benchmarks/python/test_huggingface_attn_fwd.py @@ -135,16 +135,16 @@ def test_huggingface_attn_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [attention_mask, inputs]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_huggingface_attn_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, seq_len, nh, n_embd = size dropout_p = 0.2 @@ -153,10 +153,15 @@ def test_huggingface_attn_fwd_baseline_benchmark( batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype ) + benchmark_fn = { + "eager": huggingface_attn_fwd, + "torchcompile": torch.compile(huggingface_attn_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(huggingface_attn_fwd) if compile else huggingface_attn_fwd, + benchmark_fn[executor], [attention_mask, inputs, size, dropout_p], iobytes=huggingface_attn_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_instancenorm_bwd.py b/benchmarks/python/test_instancenorm_bwd.py index 99d3e3baf2b..4022c5f395f 100644 --- a/benchmarks/python/test_instancenorm_bwd.py +++ b/benchmarks/python/test_instancenorm_bwd.py @@ -30,13 +30,13 @@ def test_instancenorm_bwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_instancenorm_bwd_baseline_benchmark( - benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_bwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "instance_norm" + benchmark, size, dtype, channels_last, executor, "instance_norm" ) diff --git a/benchmarks/python/test_instancenorm_fwd.py b/benchmarks/python/test_instancenorm_fwd.py index 3b8f6564f51..3335fcc7bbf 100644 --- a/benchmarks/python/test_instancenorm_fwd.py +++ b/benchmarks/python/test_instancenorm_fwd.py @@ -29,13 +29,13 @@ def test_instancenorm_fwd_nvf_benchmark( ) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=4)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("channels_last", [True, False]) def test_instancenorm_fwd_baseline_benchmark( - benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, compile: 
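One more convention runs through all the backward benchmarks in these hunks: the (possibly compiled) forward function executes once outside the timed region, and only `unary_bwd_torch(outputs, grads)` is measured. A compressed sketch of that shape, where `unary_bwd` is a guess at what the shared helper does rather than its actual implementation:

    import torch

    def unary_bwd(outputs: torch.Tensor, grads: torch.Tensor) -> None:
        # Replay only the backward pass; retain_graph permits repeated timing.
        outputs.backward(grads, retain_graph=True)

    inputs = torch.randn(64, 4096, device="cuda", requires_grad=True)
    grads = torch.randn(64, 4096, device="cuda")

    def fwd():
        return torch.nn.functional.gelu(inputs)

    fwd_fn = {"eager": fwd, "torchcompile": torch.compile(fwd)}
    outputs = fwd_fn["eager"]()  # the executor key comes from the parametrization
    unary_bwd(outputs, grads)    # this call is what run_benchmark times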
bool + benchmark, size: tuple, dtype: torch.dtype, channels_last: bool, executor: str ): norm_fwd_baseline_benchmark( - benchmark, size, dtype, channels_last, compile, "instance_norm" + benchmark, size, dtype, channels_last, executor, "instance_norm" ) diff --git a/benchmarks/python/test_layernorm_bwd.py b/benchmarks/python/test_layernorm_bwd.py index d76046575dc..926ab2ef0fb 100644 --- a/benchmarks/python/test_layernorm_bwd.py +++ b/benchmarks/python/test_layernorm_bwd.py @@ -146,16 +146,16 @@ def test_layernorm_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, grads, mean, invstd, weights]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_layernorm_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) @@ -171,13 +171,16 @@ def layernorm_fwd(): bias=bias, ) - fwd_fn = torch.compile(layernorm_fwd) if compile else layernorm_fwd - output = fwd_fn() + fwd_fn = { + "eager": layernorm_fwd, + "torchcompile": torch.compile(layernorm_fwd), + } + outputs = fwd_fn[executor]() # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=layernorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_layernorm_fwd.py b/benchmarks/python/test_layernorm_fwd.py index c6a5f24c8dc..52aa5838f62 100644 --- a/benchmarks/python/test_layernorm_fwd.py +++ b/benchmarks/python/test_layernorm_fwd.py @@ -106,16 +106,16 @@ def test_layernorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_layernorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, hidden_size = size inputs = [ @@ -124,10 +124,15 @@ def test_layernorm_fwd_baseline_benchmark( torch.randn(hidden_size, device="cuda", dtype=dtype), ] + benchmark_fn = { + "eager": layernorm_fwd, + "torchcompile": torch.compile(layernorm_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(layernorm_fwd) if compile else layernorm_fwd, + benchmark_fn[executor], inputs, iobytes=layernorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py index 865caba2e31..2448ac07fd9 100644 --- a/benchmarks/python/test_matmul.py +++ b/benchmarks/python/test_matmul.py @@ -25,14 +25,14 @@ def load_matmul_problems(): @pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"]) -@pytest.mark.parametrize("compile", [False], ids=["eager"]) +@pytest.mark.parametrize("executor", ["eager"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"]) @pytest.mark.parametrize( "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val) ) def test_matmul_baseline_benchmark( benchmark, - compile: bool, + executor: str, config: tuple, 
dtype: torch.dtype, half_reduction: bool, diff --git a/benchmarks/python/test_nanogpt_attn_bwd.py b/benchmarks/python/test_nanogpt_attn_bwd.py index 2efb8e7d58d..88d8d56e26d 100644 --- a/benchmarks/python/test_nanogpt_attn_bwd.py +++ b/benchmarks/python/test_nanogpt_attn_bwd.py @@ -124,16 +124,16 @@ def test_nanogpt_attn_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask, bias_mask]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_nanogpt_attn_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, seq_len, nh, n_embd = size dropout_p = 0.2 @@ -154,14 +154,18 @@ def nanogpt_attn_fwd(): return output # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(nanogpt_attn_fwd) if compile else nanogpt_attn_fwd - output = fwd_fn() + fwd_fn = { + "eager": nanogpt_attn_fwd, + "torchcompile": torch.compile(nanogpt_attn_fwd), + } + outputs = fwd_fn[executor]() + grads = torch.randn(batch_size, nh, seq_len, seq_len, device="cuda", dtype=dtype) # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=nanogpt_attn_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_nanogpt_attn_fwd.py b/benchmarks/python/test_nanogpt_attn_fwd.py index 4dbd5821c59..5336d96cba5 100644 --- a/benchmarks/python/test_nanogpt_attn_fwd.py +++ b/benchmarks/python/test_nanogpt_attn_fwd.py @@ -137,16 +137,16 @@ def test_nanogpt_attn_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, bias]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_nanogpt_attn_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() batch_size, seq_len, nh, n_embd = size dropout_p = 0.2 @@ -154,10 +154,16 @@ def test_nanogpt_attn_fwd_baseline_benchmark( bias = torch.tril(torch.ones(seq_len, seq_len, device="cuda")).view( 1, 1, seq_len, seq_len ) + + benchmark_fn = { + "eager": nanogpt_attn_fwd, + "torchcompile": torch.compile(nanogpt_attn_fwd), + } + # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(nanogpt_attn_fwd) if compile else nanogpt_attn_fwd, + benchmark_fn[executor], [inputs, bias, size, dropout_p], iobytes=nanogpt_attn_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_pointwise_mul.py b/benchmarks/python/test_pointwise_mul.py index 0162950cc47..31ec20d6b10 100644 --- a/benchmarks/python/test_pointwise_mul.py +++ b/benchmarks/python/test_pointwise_mul.py @@ -50,21 +50,26 @@ def test_pointwise_mul_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_pointwise_mul_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + 
executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() input = torch.randn(size, device="cuda", dtype=dtype) + + benchmark_fn = { + "eager": pointwise_mul_fwd_fn, + "torchcompile": torch.compile(pointwise_mul_fwd_fn), + } # Inputs and outputs are same as nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(pointwise_mul_fwd_fn) if compile else pointwise_mul_fwd_fn, + benchmark_fn[executor], [input], ) diff --git a/benchmarks/python/test_reduction.py b/benchmarks/python/test_reduction.py index f734769a1e5..303f65609b7 100644 --- a/benchmarks/python/test_reduction.py +++ b/benchmarks/python/test_reduction.py @@ -53,7 +53,7 @@ def test_reduction_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0, 1]) @@ -62,14 +62,19 @@ def test_reduction_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() input = torch.randn(size, device="cuda", dtype=dtype) + + benchmark_fn = { + "eager": reduction_fwd_fn, + "torchcompile": torch.compile(reduction_fwd_fn), + } # Inputs and outputs are same as nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(reduction_fwd_fn) if compile else reduction_fwd_fn, + benchmark_fn[executor], [input, reduction_axis], ) diff --git a/benchmarks/python/test_reduction_epilogue.py b/benchmarks/python/test_reduction_epilogue.py index 231090e4135..aacf7326d29 100644 --- a/benchmarks/python/test_reduction_epilogue.py +++ b/benchmarks/python/test_reduction_epilogue.py @@ -67,7 +67,7 @@ def test_reduction_epilogue_nvf_benchmark( run_benchmark(benchmark, fd.execute, [x, epilogue]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0]) @@ -76,17 +76,21 @@ def test_reduction_epilogue_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() x = torch.randn(size, device="cuda", dtype=dtype) epilogue = torch.randn(size[reduction_axis - 1], device="cuda", dtype=dtype) # Inputs and outputs are same as nvFuser, no need for manual IOByte computation + + benchmark_fn = { + "eager": reduction_epilogue_fwd_fn, + "torchcompile": torch.compile(reduction_epilogue_fwd_fn), + } + run_benchmark( benchmark, - torch.compile(reduction_epilogue_fwd_fn) - if compile - else reduction_epilogue_fwd_fn, + benchmark_fn[executor], [x, epilogue, reduction_axis], ) diff --git a/benchmarks/python/test_rmsnorm_bwd.py b/benchmarks/python/test_rmsnorm_bwd.py index 697aa8848ab..2fb4698fdbf 100644 --- a/benchmarks/python/test_rmsnorm_bwd.py +++ b/benchmarks/python/test_rmsnorm_bwd.py @@ -112,16 +112,16 @@ def test_rmsnorm_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, rms_eps, grads, weights]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", 
"torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_rmsnorm_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) grads = torch.randn(size, device="cuda", dtype=dtype) @@ -134,13 +134,13 @@ def rmsnorm_fwd(): return output # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(rmsnorm_fwd) if compile else rmsnorm_fwd - output = fwd_fn() + fwd_fn = {"eager": rmsnorm_fwd, "torchcompile": torch.compile(rmsnorm_fwd)} + outputs = fwd_fn[executor]() # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=rmsnorm_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_rmsnorm_fwd.py b/benchmarks/python/test_rmsnorm_fwd.py index b7839b631de..0114ae6507c 100644 --- a/benchmarks/python/test_rmsnorm_fwd.py +++ b/benchmarks/python/test_rmsnorm_fwd.py @@ -86,24 +86,28 @@ def test_rmsnorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, weights]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_rmsnorm_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(size, device="cuda", dtype=dtype) weights = torch.randn(size[1], device="cuda", dtype=dtype) + benchmark_fn = { + "eager": rmsnorm_fwd_fn, + "torchcompile": torch.compile(rmsnorm_fwd_fn), + } # Manually compute IOBytes: See PR #1725 run_benchmark( benchmark, - torch.compile(rmsnorm_fwd_fn) if compile else rmsnorm_fwd_fn, + benchmark_fn[executor], [inputs, weights], iobytes=rmsnorm_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_scale_bias_relu_bwd.py b/benchmarks/python/test_scale_bias_relu_bwd.py index a85c62a1592..c98d32382b5 100644 --- a/benchmarks/python/test_scale_bias_relu_bwd.py +++ b/benchmarks/python/test_scale_bias_relu_bwd.py @@ -79,16 +79,16 @@ def test_sbr_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [scale, bool_mask, grads]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_sbr_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) grads = torch.randn(*size, device="cuda", dtype=dtype) @@ -99,12 +99,12 @@ def sbr_fwd(): return torch.nn.functional.relu(inputs * scale + bias) # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(sbr_fwd) if compile else sbr_fwd - eager_output = sbr_fwd() + fwd_fn = {"eager": sbr_fwd, "torchcompile": torch.compile(sbr_fwd)} + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [eager_output, grads], + [outputs, grads], iobytes=sbr_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_scale_bias_relu_fwd.py 
b/benchmarks/python/test_scale_bias_relu_fwd.py index ede13dbb767..c09b11296c3 100644 --- a/benchmarks/python/test_scale_bias_relu_fwd.py +++ b/benchmarks/python/test_scale_bias_relu_fwd.py @@ -82,24 +82,26 @@ def test_sbr_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [bias, scale, inputs]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_sbr_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) bias = torch.ones(size[-1], device="cuda", dtype=dtype) scale = torch.ones(size[-1], device="cuda", dtype=dtype) + benchmark_fn = {"eager": sbr_fwd_fn, "torchcompile": torch.compile(sbr_fwd_fn)} + run_benchmark( benchmark, - torch.compile(sbr_fwd_fn) if compile else sbr_fwd_fn, + benchmark_fn[executor], [bias, scale, inputs], iobytes=sbr_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_silu_mul_bwd.py b/benchmarks/python/test_silu_mul_bwd.py index 98995e860b1..25276dec474 100644 --- a/benchmarks/python/test_silu_mul_bwd.py +++ b/benchmarks/python/test_silu_mul_bwd.py @@ -79,16 +79,16 @@ def test_silu_mul_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [grads, x, y]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_silu_mul_bwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() x = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) y = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True) @@ -98,12 +98,12 @@ def silu_mul_fwd(): return torch.nn.functional.silu(x) * y # Compile the fwd fn for torchcompile - fwd_fn = torch.compile(silu_mul_fwd) if compile else silu_mul_fwd - eager_output = fwd_fn() + fwd_fn = {"eager": silu_mul_fwd, "torchcompile": torch.compile(silu_mul_fwd)} + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [eager_output, grads], + [outputs, grads], iobytes=silu_mul_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_silu_mul_fwd.py b/benchmarks/python/test_silu_mul_fwd.py index 0f1e86d0d56..3de05067cb2 100644 --- a/benchmarks/python/test_silu_mul_fwd.py +++ b/benchmarks/python/test_silu_mul_fwd.py @@ -56,22 +56,27 @@ def test_silu_mul_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_silu_mul_fwd_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() inputs = [torch.randn(*size, device="cuda", dtype=dtype) for _ in range(2)] + benchmark_fn = { + "eager": silu_mul_fwd_fn, + "torchcompile": torch.compile(silu_mul_fwd_fn), + } + # Inputs and outputs are same as nvFuser, no need 
for manual IOByte computation run_benchmark( benchmark, - torch.compile(silu_mul_fwd_fn) if compile else silu_mul_fwd_fn, + benchmark_fn[executor], inputs, ) diff --git a/benchmarks/python/test_softmax_bwd.py b/benchmarks/python/test_softmax_bwd.py index 86f22654380..049da18fe27 100644 --- a/benchmarks/python/test_softmax_bwd.py +++ b/benchmarks/python/test_softmax_bwd.py @@ -91,7 +91,7 @@ def test_softmax_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0, 1]) @@ -100,9 +100,9 @@ def test_softmax_bwd_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() input = torch.randn(size, device="cuda", dtype=dtype, requires_grad=True) grads = torch.randn(size, device="cuda", dtype=dtype) @@ -110,12 +110,12 @@ def test_softmax_bwd_baseline_benchmark( def softmax_fwd(): return torch.nn.functional.softmax(input, dim=reduction_axis) - fwd_fn = torch.compile(softmax_fwd) if compile else softmax_fwd - output = fwd_fn() + fwd_fn = {"eager": softmax_fwd, "torchcompile": torch.compile(softmax_fwd)} + outputs = fwd_fn[executor]() run_benchmark( benchmark, unary_bwd_torch, - [output, grads], + [outputs, grads], iobytes=softmax_bwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_softmax_fwd.py b/benchmarks/python/test_softmax_fwd.py index 2e672eb2e30..d138aa1ced1 100644 --- a/benchmarks/python/test_softmax_fwd.py +++ b/benchmarks/python/test_softmax_fwd.py @@ -81,7 +81,7 @@ def test_softmax_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("reduction_axis", [0, 1]) @@ -90,15 +90,19 @@ def test_softmax_fwd_baseline_benchmark( size: tuple, dtype: torch.dtype, reduction_axis: int, - compile: bool, + executor: str, ): - if compile: + if executor == "torchcompile": clear_dynamo_cache() input = torch.randn(size, device="cuda", dtype=dtype) + benchmark_fn = { + "eager": softmax_fwd_fn, + "torchcompile": torch.compile(softmax_fwd_fn), + } run_benchmark( benchmark, - torch.compile(softmax_fwd_fn) if compile else softmax_fwd_fn, + benchmark_fn[executor], [input, reduction_axis], iobytes=softmax_fwd_iobytes(size, dtype), ) diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py index cf290f278a5..a4e3198cc9a 100644 --- a/benchmarks/python/test_transpose.py +++ b/benchmarks/python/test_transpose.py @@ -74,7 +74,7 @@ def test_transpose_nvf_benchmark( run_benchmark(benchmark, fd.execute, [input1, input2]) -@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"]) +@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) @pytest.mark.parametrize("size", generate_input_sizes(dims=3)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) @pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)]) @@ -83,15 +83,21 @@ def test_transpose_baseline_benchmark( size: tuple, dtype: torch.dtype, axes: list, - compile: bool, + executor: str, ): - if compile: 
+ if executor == "torchcompile": clear_dynamo_cache() input1 = torch.randn(size, device="cuda", dtype=dtype) input2 = torch.randn(size, device="cuda", dtype=dtype) + + benchmark_fn = { + "eager": transpose_fwd_fn, + "torchcompile": torch.compile(transpose_fwd_fn), + } + # Inputs and outputs are same as nvFuser, no need for manual IOByte computation run_benchmark( benchmark, - torch.compile(transpose_fwd_fn) if compile else transpose_fwd_fn, + benchmark_fn[executor], [input1, input2, axes[0], axes[1]], ) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index 4cec43b2c92..727894caebb 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -402,6 +402,55 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } } + void generateVectorizedLdSt( + Val* in, + Val* out, + CacheOp cache_op, + int64_t vector_word_size) { + auto out_tv = out->as()->view(); + auto in_tv = in->as()->view(); + + bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Local; + + bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local && + in_tv->getMemoryType() == MemoryType::Global; + + bool globalToGlobal = out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Global; + + bool is_volatile_to = out_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map->needsRawSync(out_tv).hasBID(); + + bool is_volatile_from = in_tv->getMemoryType() == MemoryType::Global && + kernel_->summary().sync_map->needsRawSync(in_tv).hasBID(); + + if (localToGlobal) { + code_ << "loadLocalToGlobal<" << out->dtype() << ", /*vec_size=*/" + << vector_word_size << ", /*is_volatile=*/" + << (is_volatile_to ? "true" : "false") << ">("; + code_ << " &" << gen(out) << ", &" << gen(in) << ")"; + } else if (globalToLocal) { + code_ << "loadGlobalToLocal<" << out->dtype() << ", /*vec_size=*/" + << vector_word_size << ", /*is_volatile=*/" + << (is_volatile_from ? "true" : "false") << ", " + << "CacheOp::" << cache_op << ">(&" << gen(out) << ", "; + code_ << " &" << gen(in) << ")"; + } else if (globalToGlobal) { + code_ << "loadGlobalToGlobal<" << out->dtype() << ", /*vec_size=*/" + << vector_word_size << ", /*is_volatile_to=*/" + << (is_volatile_to ? "true" : "false") << ", /*is_volatile_from=*/" + << (is_volatile_from ? "true" : "false") << ">("; + code_ << " &" << gen(out) << ", "; + code_ << " &" << gen(in) << ")"; + } else { + code_ << "loadGeneric<" << out->dtype() << ", " << vector_word_size + << ">("; + code_ << " &" << gen(out) << ", "; + code_ << " &" << gen(in) << ")"; + } + } + // Cannot just use ConstIrVisitor::handle as it expects a vector of // const Expr*, whereas most of the IR API returns a vector of // non-const Expr*. @@ -1001,6 +1050,68 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } void handle(const TernaryOp* top) final { + // Note: vectorized TernaryOp looks something like: + // ``` + // predicate + // ? LoadGlobalToLocal(&dst[0], &in2[index]) + // : arraySet(&dst[0], in3); + // ``` + // + // Current limitation: + // 1. only TernaryOpType::Where is supported; + // 2. predicate needs to be a scalar; + // 3. output needs to be a TensorView; + // 4. one and only one of the inputs needs to be a TensorView. (This is + // coming from validation analysis.) 
+ if (top->out()->isA()) { + // Get vectorization information + auto out_tv = top->out()->as()->view(); + int64_t vector_word_size = ir_utils::getVectorizeSize(out_tv); + bool is_vector_op = vectorize_scope_ && vector_word_size != 1; + + if (is_vector_op) { + NVF_CHECK( + top->in1()->isScalar(), + "predicate should be a scalar for vectorized TernaryOp::where"); + NVF_CHECK( + !top->out()->isScalar(), + "scalar output in vectorization isn't supported"); + NVF_CHECK( + top->getTernaryOpType() == TernaryOpType::Where, + "vectorization only works on TernaryOp::where"); + indent() << gen(top->in1()) << "\n"; + indent() << kTab << "? "; + auto vec_load = [&out_tv, &top, &vector_word_size, this](Val* in) { + if (in->isScalar()) { + if (out_tv->getMemoryType() == MemoryType::Local && + !out_tv->isCircularBuffered()) { + // Vectorized initialization, explicit type conversion is needed + // for complex numbers + code_ << genVariableName(out_tv) << ".set(" + << genCall(out_tv->dtype(), gen(in)) << ")"; + } else { + // Note: currently arraySet option is not vectorized, so it will + // rely on auto vectorization pass of cuda compiler. + code_ << "arraySet<" << out_tv->getDataType().value() << ", " + << vector_word_size << ">(&" << gen(top->out()) << ", (" + << out_tv->getDataType().value() << ")" << gen(in) << ")"; + } + } else { + generateVectorizedLdSt( + in, top->out(), CacheOp::AllLevels, vector_word_size); + } + }; + + // TODO: should we have the option to specify cache level? + vec_load(top->in2()); + code_ << "\n"; + indent() << kTab << ": "; + vec_load(top->in3()); + code_ << ";\n"; + return; + } + } + if (!print_inline_) { indent() << gen(top->out()); if (!top->out()->isScalar()) { @@ -1338,53 +1449,10 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { "Invalid input to unary op with tensor output, found: ", ldst->in()->toString()); - auto in_tv = ldst->in()->as()->view(); - bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global && - in_tv->getMemoryType() == MemoryType::Local; - - bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local && - in_tv->getMemoryType() == MemoryType::Global; - - bool globalToGlobal = out_tv->getMemoryType() == MemoryType::Global && - in_tv->getMemoryType() == MemoryType::Global; - - bool is_volatile_to = out_tv->getMemoryType() == MemoryType::Global && - kernel_->summary().sync_map->needsRawSync(out_tv).hasBID(); - - bool is_volatile_from = - in_tv->getMemoryType() == MemoryType::Global && - kernel_->summary().sync_map->needsRawSync(in_tv).hasBID(); - - if (localToGlobal) { - indent() << "loadLocalToGlobal<" << ldst->out()->dtype() - << ", /*vec_size=*/" << vector_word_size - << ", /*is_volatile=*/" - << (is_volatile_to ? "true" : "false") << ">("; - code_ << " &" << gen(ldst->out()) << ", &" << gen(ldst->in()) - << ");\n"; - } else if (globalToLocal) { - indent() << "loadGlobalToLocal<" << ldst->out()->dtype() - << ", /*vec_size=*/" << vector_word_size - << ", /*is_volatile=*/" - << (is_volatile_from ? "true" : "false") << ", " - << "CacheOp::" << ldst->cacheOp() << ">(&" - << gen(ldst->out()) << ", "; - code_ << " &" << gen(ldst->in()) << ");\n"; - } else if (globalToGlobal) { - indent() << "loadGlobalToGlobal<" << ldst->out()->dtype() - << ", /*vec_size=*/" << vector_word_size - << ", /*is_volatile_to=*/" - << (is_volatile_to ? "true" : "false") - << ", /*is_volatile_from=*/" - << (is_volatile_from ? 
"true" : "false") << ">("; - code_ << " &" << gen(ldst->out()) << ", "; - code_ << " &" << gen(ldst->in()) << ");\n"; - } else { - indent() << "loadGeneric<" << ldst->out()->dtype() << ", " - << vector_word_size << ">("; - code_ << " &" << gen(ldst->out()) << ", "; - code_ << " &" << gen(ldst->in()) << ");\n"; - } + indent(); + generateVectorizedLdSt( + ldst->in(), ldst->out(), ldst->cacheOp(), vector_word_size); + code_ << ";\n"; } return; } diff --git a/csrc/device_lower/lower2device.h b/csrc/device_lower/lower2device.h index 38f914b92ab..16d2177ca33 100644 --- a/csrc/device_lower/lower2device.h +++ b/csrc/device_lower/lower2device.h @@ -45,10 +45,6 @@ namespace nvfuser { -// TODO: we frequently use pairwise root mapping from consumers to producers. -// This information is implicitly in the computeAtMaps, but there's no isolated -// container for this information that we can reuse. Would be nice to generate -// such a structure and propagate it through lowering. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) class GpuLower : public NonCopyable { class KernelIrMapper; diff --git a/csrc/device_lower/pass/circular_buffer.cpp b/csrc/device_lower/pass/circular_buffer.cpp index f2eb30297a5..273cee42ba9 100644 --- a/csrc/device_lower/pass/circular_buffer.cpp +++ b/csrc/device_lower/pass/circular_buffer.cpp @@ -97,7 +97,7 @@ class CircularBufferLoopCloner : public kir::IrVisitor { } case CircularBufferLoopStage::Main: { if (requireEpilogue(circular_buffer_load_exprs_)) { - stop = IrBuilder::subExpr( + stop = SimplifyingIrBuilder::subExpr( circular_buffer_loop_->stop(), SimplifyingIrBuilder::create( prefetch_distance, DataType::Index)); @@ -106,7 +106,7 @@ class CircularBufferLoopCloner : public kir::IrVisitor { } case CircularBufferLoopStage::Epilog: { NVF_ERROR(requireEpilogue(circular_buffer_load_exprs_)); - start = IrBuilder::subExpr( + start = SimplifyingIrBuilder::subExpr( circular_buffer_loop_->stop(), SimplifyingIrBuilder::create( prefetch_distance, DataType::Index)); @@ -424,7 +424,7 @@ class CloneTmaCircularBufferLoopAndInsertSync int64_t stage_depth = GpuLower::current()->circularBufferInfo().getStageDepthFor( circular_buffer_loop_->iter_domain()); - Val* result = IrBuilder::modExpr( + Val* result = SimplifyingIrBuilder::modExpr( cloned_top_level_loop_->indexOrStartIfTrivial(), IrBuilder::create(stage_depth, PrimDataType::Index)); return GpuLower::current()->commonScalarMap().hoistScalar( @@ -441,8 +441,8 @@ class CloneTmaCircularBufferLoopAndInsertSync GpuLower::current()->circularBufferInfo().getPrefetchDistanceFor( circular_buffer_loop_->iter_domain()); - auto current_load_stage = IrBuilder::modExpr( - IrBuilder::addExpr( + auto current_load_stage = SimplifyingIrBuilder::modExpr( + SimplifyingIrBuilder::addExpr( cloned_top_level_loop_->indexOrStartIfTrivial(), IrBuilder::create(prefetch_distance, PrimDataType::Index)), IrBuilder::create(stage_depth, PrimDataType::Index)); diff --git a/csrc/device_lower/pass/predicate.cpp b/csrc/device_lower/pass/predicate.cpp index 034534be7be..18e632d9e9e 100644 --- a/csrc/device_lower/pass/predicate.cpp +++ b/csrc/device_lower/pass/predicate.cpp @@ -103,7 +103,8 @@ class ConditionalFromPredicateModifier : public kir::ExprMutator { "Expecting predicated body to only have one vectorized expression."); auto vec_expr = ite->thenBody()[0]; NVF_ERROR( - vec_expr->isA() || vec_expr->isA(), + vec_expr->isA() || vec_expr->isA() || + vec_expr->isA(), "Vectorize predicate exprs only supported on set operations."); NVF_ERROR( 
ir_utils::isTvOp(vec_expr), diff --git a/csrc/device_lower/pass/replace_size.cpp b/csrc/device_lower/pass/replace_size.cpp index 1e1bc8b9738..1ace266794c 100644 --- a/csrc/device_lower/pass/replace_size.cpp +++ b/csrc/device_lower/pass/replace_size.cpp @@ -59,28 +59,38 @@ std::unordered_map getSimplificationMap(Fusion* fusion) { // 1. Constant ints. These might be non-immediate constants // 2. Extents of input TVs. // 3. Extents of non-input TVs. - // Within these three classes, we find the IterDomain with the smallest - // name(). + // Within these three classes, we find the IterDomain with the + // smallest name(). For case 3, we also prefer the IterDomain with + // the simplest extent, which has the smallest number of defining + // expressions. bool group_is_const = false; IterDomain* rep = nullptr; bool rep_is_input_id = false; + int64_t rep_num_defs = 0; std::unordered_set dynamic_scalars; for (Val* v : *group) { auto* id = dynamic_cast(v); NVF_ERROR( id != nullptr, "Expected only IterDomains in exact graph ValGroups"); bool is_input_id = fusion_input_ids.count(id) > 0; - if (rep == nullptr) { - rep = id; - rep_is_input_id = is_input_id; - continue; - } Val* ext = id->extent(); bool ext_is_const = ext->isConstInt(); if (!ext_is_const) { dynamic_scalars.insert(ext); } + // Initializing rep with the first ID + if (rep == nullptr) { + rep = id; + rep_is_input_id = is_input_id; + group_is_const = ext_is_const; + // If neither const nor input, record the number of exprs + if (!ext_is_const && !is_input_id) { + rep_num_defs = ir_utils::getOperationCount(id->extent()); + } + continue; + } + if (ext_is_const) { if (!group_is_const || id->name() < rep->name()) { rep = id; @@ -103,9 +113,11 @@ std::unordered_map getSimplificationMap(Fusion* fusion) { if (group_is_const || rep_is_input_id) { continue; } - if (id->name() < rep->name()) { + auto num_defs = ir_utils::getOperationCount(id->extent()); + if (num_defs < rep_num_defs || id->name() < rep->name()) { rep = id; rep_is_input_id = is_input_id; + rep_num_defs = num_defs; continue; } } diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index 00bb3a1d458..d10ffb90e50 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -1907,6 +1907,11 @@ Val* proveLinearAndGetStride( const ValGroup& linear_g, const ValGroups& domain) { FusionGuard fg(linear_g->front()->fusion()); + // This function uses simplifyExpr extensively. If we have disabled expression + // simplification in order to help inspect generated kernels, then we will get + // incorrect results here. Instead, we ensure it is enabled using this guard. + DisableOptionsGuard dog; + DisableOptionsGuard::getCurOptions().unset(DisableOption::ExprSimplify); if (simplifyExpr(extent(linear_g))->isOne()) { // If the extent of the linear group is 1, we always consider it as linear, // regardless of its relationship with domain.
For this case, we use stride diff --git a/csrc/device_lower/validation.cpp b/csrc/device_lower/validation.cpp index ef10cdb6bc1..cc1b2dec53a 100644 --- a/csrc/device_lower/validation.cpp +++ b/csrc/device_lower/validation.cpp @@ -668,17 +668,31 @@ class VectorizeValidator : public OptInDispatch { tv_def != nullptr, "Tv has no definition, cannot validate vectorization:", tv); - auto producer_tv = tv_def->inputs().at(0)->as(); - auto producer_word_size_it = - GpuLower::current()->vectorizedAccesses().find(producer_tv); - if (producer_word_size_it != - GpuLower::current()->vectorizedAccesses().end()) { - producer_word_size_it->second = - std::max(vector_word_size, producer_word_size_it->second); - } else { - GpuLower::current()->vectorizedAccesses().emplace( - producer_tv, vector_word_size); + // TernaryOp(where) could have multiple inputs, but we only support a + // single TensorView input for vectorization. + TensorView* producer_tv = nullptr; + for (auto input : tv_def->inputs()) { + if (!input->isA()) { + continue; + } + NVF_ERROR( + producer_tv == nullptr, + "Vectorization validation only supports ops with a single TensorView input"); + producer_tv = input->as(); + auto producer_word_size_it = + GpuLower::current()->vectorizedAccesses().find(producer_tv); + if (producer_word_size_it != + GpuLower::current()->vectorizedAccesses().end()) { + producer_word_size_it->second = + std::max(vector_word_size, producer_word_size_it->second); + } else { + GpuLower::current()->vectorizedAccesses().emplace( + producer_tv, vector_word_size); + } } + NVF_ERROR( + producer_tv != nullptr, + "Vectorization validation requires a TensorView input"); VectorizedSetInfo vectorized_set_info; vectorized_set_info.consumer_tv = tv; @@ -798,6 +812,10 @@ void validateAndCollectVectorizeInfo(Fusion* fusion) { Expr* def = tv->definition(); NVF_ERROR( def == nullptr || def->isA() || def->isA() || + def->isA() || + (def->isA() && + def->as()->getTernaryOpType() == + TernaryOpType::Where) || (def->isA() && def->as()->serialGridReductionRequested()), "Vectorized accesses cannot be inline with computation: ", diff --git a/csrc/disjoint_set.h b/csrc/disjoint_set.h index c1638ae2037..568f9ff2604 100644 --- a/csrc/disjoint_set.h +++ b/csrc/disjoint_set.h @@ -153,25 +153,25 @@ class VectorOfUniqueEntries { // Returns first element in vector T front() const { -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) NVF_ERROR(!empty()); -#endif // NDEBUG +#endif // !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) return vector_.front(); } // Returns last element in vector T back() const { -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) NVF_ERROR(!empty()); -#endif // NDEBUG +#endif // !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) return vector_.back(); } // Remove and returns the last element in vector T popBack() { -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) NVF_ERROR(!empty()); -#endif // NDEBUG +#endif // !defined(NDEBUG) || defined(NVFUSER_EXPLICIT_ERROR_CHECK) T v = vector_.back(); set_.erase(v); vector_.pop_back(); diff --git a/csrc/fusion.cpp b/csrc/fusion.cpp index d9d9cc08003..ab1f44b4571 100644 --- a/csrc/fusion.cpp +++ b/csrc/fusion.cpp @@ -752,27 +752,6 @@ std::vector Fusion::getTerminatingOutputs() const { return terminating_outputs; } -bool Fusion::isAliasCompatible(Val* left, Val* right) { - // Nullptr check - if (left == nullptr || right == nullptr) { - return false; - } - - // DataType check - if (!left->getDataType().has_value() ||
!right->getDataType().has_value() || - left->getDataType().value() != right->getDataType().value()) { - return false; - } - - // ValType check - if (!left->getValType().has_value() || !right->getValType().has_value() || - left->getValType().value() != right->getValType().value()) { - return false; - } - - return true; -} - void Fusion::aliasOutputToInput( Val* output, Val* input, @@ -791,33 +770,16 @@ void Fusion::aliasOutputToInput( } NVF_ERROR(type == AllocationType::ReuseBuffer); - // `input` can be a cast of a fusion input. - if (!input->isFusionInput()) { - auto input_expr = input->definition(); - NVF_ERROR( - input_expr->isA(), "expected unary op for aliased input"); - auto input_uop = input_expr->as(); - NVF_ERROR( - input_uop->getUnaryOpType() == UnaryOpType::Cast, - "expected aliased input to be output of cast op"); - input = input_uop->in(); - } + NVF_ERROR(input->isFusionInput(), "alias source can only be a fusion input"); NVF_ERROR( input->getDataType().has_value() && output->getDataType().has_value(), "requires DataType to be available for aliased output to input"); - if (input->getDataType().value() != output->getDataType().value()) { - output = castOp(input->getDataType().value(), output); - } - if (output->isFusionInput()) { // ensure that codegen produce a write operation on the buffer. output = set(output); } - NVF_ERROR( - isAliasCompatible(input, output), - "The input and output values are not alias-compatible."); // Let integration hide any output that wasn't a fusion output when // `aliasOutputToInput` was called. For example, running mean and var for // batch norm. diff --git a/csrc/fusion.h b/csrc/fusion.h index 7b72aef0414..871dda53811 100644 --- a/csrc/fusion.h +++ b/csrc/fusion.h @@ -403,7 +403,7 @@ class NVF_API Fusion : public IrContainer { static IrCloner copy(const Fusion* from, Fusion* to); //! During scheduling, this can be set to a non-negative value. If done, then - //! during execution by FusionExecutor, we will check that this value matches + //! during execution by KernelExecutor, we will check that this value matches //! the corresponding value in LaunchParams. int64_t expectedDynamicSmemBytes() const { return expected_dynamic_smem_bytes_; @@ -464,11 +464,6 @@ class NVF_API Fusion : public IrContainer { all_tvs_ptr_.reset(); } - private: - // Determine if the two values are compatible for aliasing - // Same DataType, ValType, and number of dimensions - bool isAliasCompatible(Val* left, Val* right); - private: // Fusion inputs and outputs std::vector inputs_; diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 402784153a0..c3132067067 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -141,7 +141,7 @@ void HostIrExecutor::handle(PostOnStream* post_ir) { "op must be a HostUnit: ", post_ir->hostOpToPost()); auto hu = post_ir->hostOpToPost()->as(); - // Compile the fusion and execute it with FusionExecutor(Cache) + // Compile the fusion and execute it with KernelExecutor(Cache) // Check if the executor has been cached. 
If not, create and cache it if (params_.use_fusion_executor_cache) { if (!fec_.count(hu)) { @@ -153,13 +153,13 @@ void HostIrExecutor::handle(PostOnStream* post_ir) { } outputs = fec_.at(hu).runFusionWithInputs(input_IValues); } else { - FusionExecutor& fe = fe_[hu]; - if (!fe.isCompiled()) { + KernelExecutor& ke = fe_[hu]; + if (!ke.isCompiled()) { Fusion* fusion = hu->fusion_to_execute(); DynamicTransform::concretizeFusion(fusion, input_IValues); - fe.compileFusion(fusion, input_IValues); + ke.compile(fusion, input_IValues); } - outputs = fe.runFusion(input_IValues); + outputs = ke.run(input_IValues); if (!params_.cache_fusion_executor) { fe_.erase(hu); } diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index 2dcec129cc8..96bcc725d7b 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -36,7 +36,7 @@ duplication will be resolved in the future. // Set of parameters that control the behavior of HostIrExecutor struct HostIrExecutorParams { // Experimental: whether to use FusionExecutorCache rather than - // FusionExecutor. + // KernelExecutor. bool use_fusion_executor_cache = false; // Experimental: whether to apply auto-scheduling in FusionExecutorCache if // use_fusion_executor_cache=true. WAR: temporary hack mainly use for @@ -95,7 +95,7 @@ class HostIrExecutor final : public OptOutDispatch { // Stores concrete computed values ExpressionEvaluator expr_evaluator_; // Cache Fusions, FusionExecutors - std::unordered_map fe_; + std::unordered_map fe_; std::unordered_map fec_; using StreamKey = std::variant; std::unordered_map streams_; diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h index 6ac52ba0564..f9f422cd994 100644 --- a/csrc/ir/internal_base_nodes.h +++ b/csrc/ir/internal_base_nodes.h @@ -120,18 +120,23 @@ class NVF_API IterDomain : public Val { static std::vector clone( const std::vector& domains); - //! When `rfactor_domain` is true, also set the `is_rfactor_domain_` flag of - //! the result IterDomain. + //! The optional parameters of rfactor_domain and iter_type can be + //! used to override the default behavior. static IterDomain* merge( IterDomain* outer, IterDomain* inner, - bool rfactor_domain = false); + std::optional rfactor_domain = std::nullopt, + std::optional iter_type = std::nullopt); + //! The optional parameters of rfactor_domain, outer_iter_type and + //! inner_iter_type can be used to override the default behavior. static std::pair split( IterDomain* in, Val* factor, bool inner_split, - bool rfactor_domain = false); + std::optional rfactor_domain = std::nullopt, + std::optional outer_iter_type = std::nullopt, + std::optional inner_iter_type = std::nullopt); //! Resize an IterDomain by expanding both the left and right sides //! by given widths. The resulting IterDomain has an extent of diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index ca39d51684a..0ff0e5c6bf3 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -2550,7 +2550,8 @@ IterDomain* IterDomain::cloneWithoutRFactor(bool map_with_original) { IterDomain* IterDomain::merge( IterDomain* outer, IterDomain* inner, - bool rfactor_domain) { + std::optional rfactor_domain, + std::optional iter_type) { NVF_CHECK( outer->isReduction() == inner->isReduction(), "Merging IterDomains requires that their iteration types match. ", @@ -2563,24 +2564,33 @@ IterDomain* IterDomain::merge( !outer->isStride() && !inner->isStride(), "No support for merging stride domains"); + // By default, if not specified, don't create rfactor + // outputs. 
Reshape transformations, which should set the flag + // explicitly, are expected to propagate it. + if (!rfactor_domain.has_value()) { + rfactor_domain = false; + } + Val* merged_id_size = mul(outer->extent(), inner->extent()); - IterType itype = outer->getIterType(); + if (!iter_type.has_value()) { + iter_type = outer->getIterType(); - if (outer->isBroadcast() && inner->isBroadcast()) { - itype = IterType::Broadcast; - } + if (outer->isBroadcast() && inner->isBroadcast()) { + iter_type = IterType::Broadcast; + } - if ((outer->isBroadcast() || inner->isBroadcast()) && - (outer->getIterType() == IterType::Iteration || - inner->getIterType() == IterType::Iteration)) { - itype = IterType::Iteration; - } + if ((outer->isBroadcast() || inner->isBroadcast()) && + (outer->getIterType() == IterType::Iteration || + inner->getIterType() == IterType::Iteration)) { + iter_type = IterType::Iteration; + } - if ((outer->isBroadcast() || inner->isBroadcast()) && - (outer->getIterType() == IterType::GatherScatter || - inner->getIterType() == IterType::GatherScatter)) { - itype = IterType::GatherScatter; + if ((outer->isBroadcast() || inner->isBroadcast()) && + (outer->getIterType() == IterType::GatherScatter || + inner->getIterType() == IterType::GatherScatter)) { + iter_type = IterType::GatherScatter; + } } Val* expanded_extent = nullptr; @@ -2606,8 +2616,8 @@ IterDomain* IterDomain::merge( IterDomainBuilder(outer->container()->zeroVal(), merged_id_size) .parallel_type(outer->getParallelType()) .expanded_extent(expanded_extent) - .iter_type(itype) - .is_rfactor_domain(rfactor_domain) + .iter_type(*iter_type) + .is_rfactor_domain(*rfactor_domain) .build(); IrBuilder::createInContainer( @@ -2620,7 +2630,9 @@ std::pair IterDomain::split( IterDomain* in, Val* factor, bool inner_split, - bool rfactor_domain) { + std::optional rfactor_domain, + std::optional outer_iter_type, + std::optional inner_iter_type) { NVF_CHECK( factor->isIntegralScalar(), "Cannot split by non-integer value ", factor); @@ -2631,6 +2643,22 @@ std::pair IterDomain::split( expanded_remainder = ceilDiv(in->expandedExtent(), factor); } + // By default, if not specified, don't create rfactor + // outputs. Reshape transformations, which should set the flag + // explicitly, are expected to propagate it. + if (!rfactor_domain.has_value()) { + rfactor_domain = false; + } + + // If not specified, inherit these properties from the input iter domain + if (!outer_iter_type.has_value()) { + outer_iter_type = in->getIterType(); + } + + if (!inner_iter_type.has_value()) { + inner_iter_type = in->getIterType(); + } + // outer loop IterDomain IterDomain* ido = IterDomainBuilder( @@ -2639,8 +2667,8 @@ std::pair IterDomain::split( in->hasExpandedExtent() && inner_split ? expanded_remainder : nullptr) .parallel_type(in->getParallelType()) - .iter_type(in->getIterType()) - .is_rfactor_domain(rfactor_domain) + .iter_type(*outer_iter_type) + .is_rfactor_domain(*rfactor_domain) .build(); // inner loop IterDomain @@ -2651,8 +2679,8 @@ std::pair IterDomain::split( in->hasExpandedExtent() && !inner_split ?
expanded_remainder : nullptr) .parallel_type(in->getParallelType()) - .iter_type(in->getIterType()) - .is_rfactor_domain(rfactor_domain) + .iter_type(*inner_iter_type) + .is_rfactor_domain(*rfactor_domain) .build(); IrBuilder::createInContainer( @@ -4853,6 +4881,13 @@ bool ForLoop::isTrivial() const { return true; } + if (start()->isConstScalar() && simplifiedStop()->isConstScalar() && + start()->evaluate().as() + 1 == + simplifiedStop()->evaluate().as() && + step()->isOneInt()) { + return true; + } + return false; } diff --git a/csrc/ir/utils.cpp b/csrc/ir/utils.cpp index 91a1170bf38..868ba36144d 100644 --- a/csrc/ir/utils.cpp +++ b/csrc/ir/utils.cpp @@ -1204,6 +1204,66 @@ bool isFunctional(const Val* v) { return std::all_of(def->inputs().begin(), def->inputs().end(), isFunctional); } +bool isRecursivelyDefined(Val* val) { + NVF_ERROR(val != nullptr); + + std::deque vals_to_visit; + vals_to_visit.push_back(val); + + std::unordered_set visited_vals; + + while (!vals_to_visit.empty()) { + auto v = vals_to_visit.front(); + vals_to_visit.pop_front(); + + visited_vals.insert(v); + + auto v_def = v->definition(); + if (v_def == nullptr) { + continue; + } + + for (const auto inp : v_def->inputs()) { + if (inp == val) { + // Recursive dependency detected + return true; + } + // Don't visit the same val multiple times + if (!visited_vals.count(inp)) { + vals_to_visit.push_back(inp); + } + } + } + + return false; +} + +int64_t getOperationCount(Val* val) { + int64_t num_ops = 0; + + // Start with the given val and recursively count the number of ops + // by traversing inputs + std::deque vals; + vals.push_back(val); + + while (!vals.empty()) { + auto v = vals.front(); + vals.pop_front(); + + auto def = v->definition(); + if (def == nullptr) { + continue; + } + ++num_ops; + + for (auto inp : def->inputs()) { + vals.push_back(inp); + } + } + + return num_ops; +} + } // namespace nvfuser::ir_utils namespace nvfuser::MmaOpUtils { diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index b02fb2fbe3e..60062b0e440 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -728,4 +728,13 @@ std::string nullOrToInlineString(const Statement* stmt); //! always returns the same result when called with the same inputs. bool isFunctional(const Val* v); +// Check if the given val is recursively defined, which is invalid in +// the Fusion IR but is not necessarily the case in other IRs +// such as the Kernel IR +bool isRecursivelyDefined(Val* val); + +// Return the number of operations that are used to define val. One +// instance of Expr is counted as a single operation. +int64_t getOperationCount(Val* val); + } // namespace nvfuser::ir_utils diff --git a/csrc/kernel_ir.h b/csrc/kernel_ir.h index 8be502233a5..f5f062cdbb7 100644 --- a/csrc/kernel_ir.h +++ b/csrc/kernel_ir.h @@ -332,12 +332,12 @@ class NVF_API Allocate final : public Expr { //! hold counters starting at zero. Typically, each participating thread would //! increment the counter and the last thread would leave the counter in a //! non-zeroed state. The next time that kernel is run, it can no longer - //! re-use the non-zero semaphore buffer, so FusionExecutor will launch + //! re-use the non-zero semaphore buffer, so KernelExecutor will launch //! at::zeroes to allocate a new buffer, resulting in a memset kernel launch. //! //! Instead, if the last thread resets the counter to zero, then the buffer //! can be re-used, and at::zeroes need only be run at the first kernel - //! launch.
If resetsToZero() is true, then KernelExecutor will use //! contigZeroedTensor() and releaseZeroedMemory() from global_allocator.h to //! reuse zeroed memory avoiding the additional kernel launch. //! @@ -840,7 +840,7 @@ class NVF_API IfThenElse final : public Expr { //! This node is used only after lowering a fusion to explicitly mark a grid //! reduction and the buffer allocation needed to do it. //! -//! This node provides FusionExecutor the information it needs to allocate the +//! This node provides KernelExecutor the information it needs to allocate the //! reduction and sync buffers. class GridReduction final : public ReductionOp { static constexpr int num_reduction_op_attr = 4; @@ -1004,7 +1004,7 @@ class NVF_API GroupedGridReduction final : public GroupedReductionOp { //! This node is used only after lowering a fusion to explicitly mark a grid //! broadcast and the buffer allocation needed to do it. //! -//! This node provides FusionExecutor the information it needs to allocate the +//! This node provides KernelExecutor the information it needs to allocate the //! broadcast and sync buffers. class NVF_API GridBroadcast final : public Expr { public: @@ -1043,7 +1043,7 @@ class NVF_API GridBroadcast final : public Expr { //! This node is used only after lowering a fusion to explicitly mark a grid //! reduction and the buffer allocation needed to do it. //! -//! This node provides FusionExecutor the information it needs to allocate the +//! This node provides KernelExecutor the information it needs to allocate the //! reduction and sync buffers. //! //! TODO: Make this a subclass of WelfordOp diff --git a/csrc/multidevice/executor.cpp b/csrc/multidevice/executor.cpp index 6de05e48d76..6546e4555cd 100644 --- a/csrc/multidevice/executor.cpp +++ b/csrc/multidevice/executor.cpp @@ -6,10 +6,12 @@ */ // clang-format on #include + #include #include #include #include +#include #include #include #include @@ -52,6 +54,22 @@ std::unique_ptr copyFusionAndChangeOutputs( return fusion_copy; } +// Used in distributed setting where we only want to allocate output space and +// receive output data from a different rank instead of computing them. 
+std::vector allocateOutputSpace( + const at::ArrayRef& inputs, + Fusion* fusion, + const c10::Device& device) { + FUSER_PERF_SCOPE("multidevice::executor::allocateOutputSpace"); + auto fusion_inputs = KernelArgumentHolder::createKernelArgumentHolder(inputs); + auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion); + + auto output_info = + getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs()); + + return allocateOutputs(fusion, output_info, device, expr_eval); +} + } // namespace MultiDeviceExecutor::MultiDeviceExecutor( @@ -186,7 +204,7 @@ std::vector MultiDeviceExecutor::runWithInput( } auto allocations = - allocOutputSpace(inputs, allocator_fusion_.get(), comm()->device()); + allocateOutputSpace(inputs, allocator_fusion_.get(), comm()->device()); NVF_ERROR(vals_to_allocate_.size() == allocations.size()); for (auto i : c10::irange(allocations.size())) { val_to_IValue[vals_to_allocate_.at(i)] = allocations.at(i); diff --git a/csrc/mutator.cpp b/csrc/mutator.cpp index 87f44c797fe..5f183bf4839 100644 --- a/csrc/mutator.cpp +++ b/csrc/mutator.cpp @@ -77,6 +77,18 @@ void OptOutMutator::registerMutation(Val* val, Val* mutation) { ", ", mutation->dtype(), ")"); + + NVF_ERROR( + !DependencyCheck::isDependencyOf(val, mutation), + "Attempted to replace a val, ", + val->toString(), + ", with a dependent val, ", + mutation->toString(), + " (", + mutation->toInlineString(), + "), which is not allowed as it would result in a recursive definition of ", + mutation->toString()); + mutations_[val] = mutation; } diff --git a/csrc/options.cpp b/csrc/options.cpp index beb98d2f8a8..54571bad3ba 100644 --- a/csrc/options.cpp +++ b/csrc/options.cpp @@ -135,6 +135,7 @@ std::unordered_map> Options< {"ptx", DebugDumpOption::Ptx}, {"ptxas_verbose", DebugDumpOption::PrintPtxasLog}, {"python_definition", DebugDumpOption::PythonDefinition}, + {"python_definition_segments", DebugDumpOption::PythonDefinitionSegments}, {"python_frontend_debug", DebugDumpOption::PythonFrontendDebug}, {"sass", DebugDumpOption::Sass}, {"segmented_fusion", DebugDumpOption::FusionSegments}, diff --git a/csrc/options.h b/csrc/options.h index 76522b93a9d..0cc6313a214 100644 --- a/csrc/options.h +++ b/csrc/options.h @@ -39,7 +39,7 @@ enum class DebugDumpOption { FusionIrPresched, //!< Dump the segmented Fusion IR before it is scheduled // TODO(wujingyue): name the following FusionIrSched FusionIr, //!< Dump the Fusion IR before lowering. This is the Fusion IR fed - //!< to `FusionExecutor::compileFusion`. + //!< to `KernelExecutor::compileFusion`. FusionIrMath, //!< Dump just the compute (math) part of the above `FusionIr` //!< for conciseness KernelIr, //!< Dump the compiler Kernel IR @@ -64,6 +64,7 @@ enum class DebugDumpOption { //! associated with what's running PreSegmenterLogging, PythonDefinition, //! Python Frontend Fusion Definition. + PythonDefinitionSegments, //! Python Frontend Fusion Definition of segments. PythonFrontendDebug, //! Python Frontend debug information. TransformPropagator, //! When running TransformPropagator, print propagation //! 
path and replay result diff --git a/csrc/python_frontend/fusion_cache.cpp b/csrc/python_frontend/fusion_cache.cpp index e95ee6820da..c896592af5a 100644 --- a/csrc/python_frontend/fusion_cache.cpp +++ b/csrc/python_frontend/fusion_cache.cpp @@ -188,7 +188,7 @@ FusionCache* FusionCache::singleton_ = nullptr; UserSchedule::UserSchedule() : scheduled_fusion(nullptr), executor(nullptr) { scheduled_fusion = std::make_unique(); - executor = std::make_unique<FusionExecutor>(); + executor = std::make_unique<KernelExecutor>(); } bool UserSchedule::canSchedule(const SchedulerType& scheduler_type) { @@ -688,7 +688,7 @@ void FusionCache::serialize(std::string filename) const { &fb_nodes, &terminal_node_idx, &fb_auto_gen_schedules, - FusionExecutor::getGlobalFusionCount(), + KernelExecutor::getGlobalFusionCount(), device_prop->major, device_prop->minor, cuda_major, @@ -722,7 +722,7 @@ void FusionCache::deserialize(std::string filename) { NVF_CHECK(fusion_cache_buffer != nullptr, "Fusion Cache buffer is invalid."); // 0. Set static fusion count in Fusion Executor - FusionExecutor::setGlobalFusionCount( + KernelExecutor::setGlobalFusionCount( fusion_cache_buffer->global_fusion_count()); // 1. Deserialize max_fusions field diff --git a/csrc/python_frontend/fusion_cache.h b/csrc/python_frontend/fusion_cache.h index 2d4f2533ba5..ffe8088b82c 100644 --- a/csrc/python_frontend/fusion_cache.h +++ b/csrc/python_frontend/fusion_cache.h @@ -41,7 +41,7 @@ struct UserSchedule { std::unique_ptr scheduled_fusion; //! Generated kernel container - std::unique_ptr<FusionExecutor> executor; + std::unique_ptr<KernelExecutor> executor; //! ID of fusion in python frontend fusion cache int64_t fusion_id_ = -1; @@ -102,7 +102,7 @@ struct FusionSchedules { //! Keeps a pointer to the last scheduled Fusion IR for printing Fusion* last_user_def_scheduled_ir; //! Keeps a pointer to the last executed executor for printing its cuda kernel - FusionExecutor* last_user_def_executor; + KernelExecutor* last_user_def_executor; //! For thread-Safe locking of Fusion Schedules std::mutex scheds_lock; //! ID of fusion in python frontend fusion cache diff --git a/csrc/python_frontend/fusion_definition.cpp b/csrc/python_frontend/fusion_definition.cpp index 05f12a7c2af..dc1af7a9f5c 100644 --- a/csrc/python_frontend/fusion_definition.cpp +++ b/csrc/python_frontend/fusion_definition.cpp @@ -370,18 +370,18 @@ std::vector FusionDefinition::execute( if (user_sched.heuristic_params == nullptr) { // Manual schedule if (!user_sched.executor->isCompiled()) { - user_sched.executor->compileFusion( + user_sched.executor->compile( user_sched.scheduled_fusion.get(), inputs, user_sched.fusion_id_, user_sched.device_id_); } - outputs = user_sched.executor->runFusion(inputs); + outputs = user_sched.executor->run(inputs); } else { // Automatic scheduler was used for UserSchedule. // Pass launch and compile params to compile and run.
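For reference, the compile-once/run-many pattern these calls follow can be written out as below. This is a hedged caller-side sketch built only from the signatures this diff introduces in csrc/runtime/executor.h, assuming an already-scheduled Fusion and IValue-wrapped device tensors:

#include <runtime/executor.h> // KernelExecutor, LaunchParams, CompileParams

using namespace nvfuser;

std::vector<at::Tensor> compileAndRun(
    Fusion* scheduled_fusion,
    const at::ArrayRef<c10::IValue>& inputs,
    const LaunchParams& lparams,
    const CompileParams& cparams) {
  KernelExecutor ke;
  if (!ke.isCompiled()) {
    // Compile with the heuristic's launch and compile params...
    ke.compile(scheduled_fusion, inputs, lparams, cparams);
  }
  // ...and pass the same params again when running.
  return ke.run(inputs, lparams, cparams);
}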
if (!user_sched.executor->isCompiled()) { - user_sched.executor->compileFusion( + user_sched.executor->compile( user_sched.scheduled_fusion.get(), KernelArgumentHolder::createKernelArgumentHolder( inputs, getCommonDeviceCUDA(inputs)), @@ -391,7 +391,7 @@ std::vector FusionDefinition::execute( user_sched.fusion_id_, user_sched.device_id_); } - outputs = user_sched.executor->runFusion( + outputs = user_sched.executor->run( inputs, user_sched.heuristic_params->lparams, user_sched.heuristic_params->cparams); diff --git a/csrc/python_frontend/fusion_record.h b/csrc/python_frontend/fusion_record.h index 154f8d28805..8d9f77c80bf 100644 --- a/csrc/python_frontend/fusion_record.h +++ b/csrc/python_frontend/fusion_record.h @@ -1823,7 +1823,7 @@ struct SelectOpRecord : RecordFunctor { void operator()(FusionState& fd) final { auto arg1 = fd.getFusionState(args_.at(0).index)->template as(); - auto arg3 = fd.getFusionState(args_.at(1).index)->template as(); + auto arg3 = fd.getFusionState(args_.at(1).index); Val* output = select(arg1, dim_, arg3); fd.setFusionState(outputs_.at(0).index, output); diff --git a/csrc/python_frontend/python_bindings.cpp b/csrc/python_frontend/python_bindings.cpp index bf092240b06..f17ea228ad0 100644 --- a/csrc/python_frontend/python_bindings.cpp +++ b/csrc/python_frontend/python_bindings.cpp @@ -651,7 +651,6 @@ void defineHeuristicParamBindings(py::module& nvfuser) { .PARAM(MatmulParams, circular_buffer_options) .PARAM(MatmulParams, supported_vec_size) .PARAM(MatmulParams, async_gmem_load_operands) - .PARAM(MatmulParams, rotate_ldmatrix_out_of_main_loop) .PARAM(MatmulParams, grid_swizzle_factor) .PARAM(MatmulParams, use_smem_epilogue) .PARAM(MatmulParams, promote_prologue_smem_reuse) diff --git a/csrc/runtime/allocations.cpp b/csrc/runtime/allocations.cpp index f482bf6dfb4..29fa52461e6 100644 --- a/csrc/runtime/allocations.cpp +++ b/csrc/runtime/allocations.cpp @@ -366,20 +366,6 @@ std::vector allocateOutputs( return out_tensors; } -std::vector allocOutputSpace( - const at::ArrayRef& inputs, - Fusion* fusion, - const c10::Device& device) { - FUSER_PERF_SCOPE("fusion_executor::allocations::allocOutputSpace"); - auto fusion_inputs = KernelArgumentHolder::createKernelArgumentHolder(inputs); - auto expr_eval = executor_utils::bindInputs(fusion_inputs, fusion); - - auto output_info = - getBufferInfos(expr_eval, PrimDataType::Int, fusion->outputs()); - - return allocateOutputs(fusion, output_info, device, expr_eval); -} - namespace { GlobalBufferInfo getBufferInfo( ExpressionEvaluator& expr_eval, @@ -685,12 +671,11 @@ class BackwardTraverseFromAllocToLogical { // Another example, if the logical domain is [I1*I2] and the allocation domain // is [I1, I2], then we will allocate as [I1, I2] and do a tensor.view(I1*I2) to // get a tensor whose semantics is [I1*I2] but memory is [I1,I2] -at::Tensor transformOutputFromAllocationToLogical( +at::Tensor transformFromAllocationToLogical( at::Tensor tensor, TensorView* tv, ExpressionEvaluator& ee) { - FUSER_PERF_SCOPE( - "fusion_executor::allocations::transformOutputFromAllocationToLogical"); + FUSER_PERF_SCOPE("allocations::transformFromAllocationToLogical"); // Ignore reductions because reductions does not exist in tensor's definition auto logical = TensorDomain::noReductions(tv->getLogicalDomain()); auto alloc = TensorDomain::noReductions(tv->getMaybeAllocationDomain()); @@ -765,9 +750,8 @@ std::pair, std::vector> inferShapeOfOutput( at::empty_strided(size_stride.first, size_stride.second, options); // TODO(jiej): we should refactor it 
here, there's no need to use // meta_tensor at all, size + stride should be used directly in the - // `transformOutputFromAllocationToLogical` - meta_tensor = - transformOutputFromAllocationToLogical(meta_tensor, tv, expr_eval); + // `transformFromAllocationToLogical` + meta_tensor = transformFromAllocationToLogical(meta_tensor, tv, expr_eval); return {meta_tensor.sizes().vec(), meta_tensor.strides().vec()}; } diff --git a/csrc/runtime/allocations.h b/csrc/runtime/allocations.h index 294013f4e1a..1ec77eb3ce2 100644 --- a/csrc/runtime/allocations.h +++ b/csrc/runtime/allocations.h @@ -56,14 +56,6 @@ NVF_API void setFillAllocationWithNan(bool value); void fillTensorWithNan(at::Tensor& t); -//! Used in distributed setting where we only want to -//! allocate output space and receive output data from -//! a different rank instead of computing them. -std::vector allocOutputSpace( - const at::ArrayRef& inputs, - Fusion* fusion, - const c10::Device& device); - // Infer the sizes and strides of an output tensor std::pair, std::vector> inferShapeOfOutput( TensorView* tv, diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 2acc3001822..a7ded48889a 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -138,10 +138,10 @@ std::string getStructuredCodeFromExternalFiles(const int64_t fusion_id) { } } // namespace -FusionExecutor::FusionExecutor() +KernelExecutor::KernelExecutor() : communicator_(&Communicator::getInstance()) {} -std::unique_ptr& FusionExecutor:: +std::unique_ptr& KernelExecutor:: evaluatorPrecomputedValues() { if (!evaluator_precomputed_values_) { evaluator_precomputed_values_ = @@ -150,7 +150,7 @@ std::unique_ptr& FusionExecutor:: return evaluator_precomputed_values_; } -std::string FusionExecutor::getStructuredCode( +std::string KernelExecutor::getStructuredCode( const std::string& kernel_str, PrimDataType index_type) const { // generating cuda code; @@ -181,11 +181,11 @@ std::string FusionExecutor::getStructuredCode( return code; } -std::string FusionExecutor::getStructuredCode() const { +std::string KernelExecutor::getStructuredCode() const { return getStructuredCode(kernelString(), kernel()->indexType()); } -void FusionExecutor::compileFusion( +void KernelExecutor::compile( Fusion* fusion, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, @@ -195,7 +195,7 @@ void FusionExecutor::compileFusion( int64_t concrete_id, int64_t runtime_id, int64_t group_id) { - FUSER_PERF_SCOPE("FusionExecutor::compileFusion"); + FUSER_PERF_SCOPE("KernelExecutor::compileFusion"); NVF_ERROR( !fusion->outputs().empty(), "No output found for this kernel, aborting."); @@ -456,7 +456,7 @@ void FusionExecutor::compileFusion( kernel_id_, compile_params, block_size); - NVF_ERROR(validKernelId(), "Invalid kernel id for FusionExecutor."); + NVF_ERROR(validKernelId(), "Invalid kernel id for KernelExecutor."); // These should be nullopt at this point, but reset just in case resetCompiledKernelProperties(); @@ -475,12 +475,12 @@ void FusionExecutor::compileFusion( } } -LaunchParams FusionExecutor::computeLaunchParams( +LaunchParams KernelExecutor::computeLaunchParams( const LaunchParams& launch_constraints, ExpressionEvaluator& expr_eval, const int64_t warp_size, DataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::computeLaunchParams"); + FUSER_PERF_SCOPE("KernelExecutor::computeLaunchParams"); NVF_ERROR(warp_size > 0, "WARP_SIZE should be larger than 0"); LaunchParams launch_params; @@ -555,7 +555,7 @@ LaunchParams 
FusionExecutor::computeLaunchParams( // Run through the rest of the parallel IterDomains and infer their size for (auto [p_type, extent] : simplified_parallel_iter_extents) { - FUSER_PERF_SCOPE("FusionExecutor::ParallelBindingResolution"); + FUSER_PERF_SCOPE("KernelExecutor::ParallelBindingResolution"); auto val = expr_eval.evaluate(extent); NVF_ERROR( val.hasValue(), @@ -635,10 +635,10 @@ LaunchParams FusionExecutor::computeLaunchParams( return launch_params; } -std::vector FusionExecutor::getIntermediateBufferInfo( +std::vector KernelExecutor::getIntermediateBufferInfo( ExpressionEvaluator& expr_eval, DataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::getIntermediateBufferInfo"); + FUSER_PERF_SCOPE("KernelExecutor::getIntermediateBufferInfo"); std::vector global_buffers; const auto kernel = lowered_->kernel(); @@ -685,7 +685,7 @@ std::vector FusionExecutor::getIntermediateBufferInfo( return global_buffers; } -void FusionExecutor::setUsedTVs() { +void KernelExecutor::setUsedTVs() { auto used_vals = fusion()->usedMathVals(); auto used_tvs = ir_utils::filterByType(used_vals); used_tvs_.clear(); @@ -744,7 +744,7 @@ void validateCooperativeLaunch( // Dump fusion inputs and outputs as well as some useful fusion // information. Note that inputs and outputs are those that are passed -// to FusionExecutor::runFusion, so outputs may not be given. +// to KernelExecutor::runFusion, so outputs may not be given. void dumpFusionArgs( int64_t fusion_id, const KernelArgumentHolder& args, @@ -768,7 +768,7 @@ void dumpFusionArgs( // Dump arguments that are passed to a CUDA kernel call, which include // the inputs and outputs of the fusion as well as temporary // global-memory buffers. Unlike dumpFusionArgs, which dumps inputs -// and outputs passed to FusionExecutor::runFusion, this function +// and outputs passed to KernelExecutor::runFusion, this function // dumps those that are passed to a CUDA kernel. void dumpKernelArgs( int64_t fusion_id, @@ -803,14 +803,14 @@ void dumpKernelArgs( } // namespace -void FusionExecutor::initializeExecutorEntry( +void KernelExecutor::initializeExecutorEntry( ExecutorEntry& executor_entry, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, const CompileParams& compile_params, const std::vector& outputs, DataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::initializeExecutorEntry"); + FUSER_PERF_SCOPE("KernelExecutor::initializeExecutorEntry"); ExpressionEvaluator expr_eval; evaluatorPrecomputedValues()->bindInputs(args); @@ -882,7 +882,7 @@ void FusionExecutor::initializeExecutorEntry( /// @param idx_type_size generally sizeof(int32_t) or sizeof(int64_t); used for /// computing how large the arrays to copy are. static void fillTensorArgMetadata( - FusionExecutor::ExecutorEntry& entry, + KernelExecutor::ExecutorEntry& entry, const PolymorphicValue& tensor_metadata, size_t idx, size_t idx_type_size) { @@ -943,11 +943,11 @@ static void fillTensorArgMetadata( // when we change the rank of a tensor or the number of arguments to a kernel. // It does not need to happen when only shapes change---use recomputeArgs for // that. 
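The comment above separates argument setup into two tiers; a sketch of the idea with illustrative names (ArgBuffers is not the real data structure):

#include <cstddef>
#include <cstring>
#include <vector>

struct ArgBuffers {
  std::vector<std::vector<std::byte>> storage; // one buffer per kernel param
  std::vector<void*> pointers; // the table handed to cuLaunchKernel

  // computeArgs analogue: reallocate buffers and rebuild the pointer table.
  // Only needed when the parameter list or a tensor's rank changes, since
  // that is what changes each buffer's size.
  void allocate(const std::vector<size_t>& param_sizes) {
    storage.resize(param_sizes.size());
    pointers.resize(param_sizes.size());
    for (size_t i = 0; i < param_sizes.size(); ++i) {
      storage[i].resize(param_sizes[i]);
      pointers[i] = storage[i].data();
    }
  }

  // recomputeArgs analogue: on a shape change, refill an existing buffer
  // with the new sizes/strides/base pointer without reallocating.
  void refresh(size_t i, const void* payload, size_t bytes) {
    std::memcpy(storage[i].data(), payload, bytes);
  }
};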
-void FusionExecutor::computeArgs( +void KernelExecutor::computeArgs( ExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("FusionExecutor::computeArgs"); + FUSER_PERF_SCOPE("KernelExecutor::computeArgs"); const std::vector& params = kernel->parameters(); entry.args.resize(params.size()); @@ -961,11 +961,11 @@ void FusionExecutor::computeArgs( // Reset the arguments that we'll pass to cuLaunchKernel. This needs to be // invoked on every shape change. -void FusionExecutor::recomputeArgs( +void KernelExecutor::recomputeArgs( ExecutorEntry& entry, ExpressionEvaluator& expr_eval, const kir::Kernel* kernel) const { - FUSER_PERF_SCOPE("FusionExecutor::recomputeArgs"); + FUSER_PERF_SCOPE("KernelExecutor::recomputeArgs"); // assert(entry.init && "entry was never initialized"); const std::vector& params = kernel->parameters(); @@ -996,10 +996,10 @@ void FusionExecutor::recomputeArgs( } } -void FusionExecutor::recompileKernel( +void KernelExecutor::recompileKernel( const LaunchParams& new_launch_params, const CompileParams& new_compile_params) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::recompileKernel"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::recompileKernel"); const auto structured_code = getStructuredCode(); block_size_high_water_mark_ = new_launch_params.nThreads(); @@ -1026,7 +1026,7 @@ void FusionExecutor::recompileKernel( } } -int64_t FusionExecutor::getAvailableDynamicSmemSize() { +int64_t KernelExecutor::getAvailableDynamicSmemSize() { NVF_ERROR( hasCompiledKernel(), "Cannot get dynamic smem size unless kernel is compiled"); @@ -1041,7 +1041,7 @@ int64_t FusionExecutor::getAvailableDynamicSmemSize() { return available_dynamic_smem_size_.value(); } -int64_t FusionExecutor::getStaticSmemSize() { +int64_t KernelExecutor::getStaticSmemSize() { NVF_ERROR( hasCompiledKernel(), "Cannot get static smem size unless kernel is compiled"); @@ -1057,7 +1057,7 @@ int64_t FusionExecutor::getStaticSmemSize() { return static_smem_size_.value(); } -void FusionExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { +void KernelExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { // If specified, check that dynamic smem size matches what the scheduler // expects int64_t expected_dynamic_smem_size = fusion()->expectedDynamicSmemBytes(); @@ -1082,7 +1082,7 @@ void FusionExecutor::validateDynamicSmemSize(int64_t dynamic_smem_size) { device_smem_limit_); } -int64_t FusionExecutor::ensureAvailableDynamicSmemSize( +int64_t KernelExecutor::ensureAvailableDynamicSmemSize( int64_t dynamic_smem_size) { NVF_ERROR( hasCompiledKernel(), @@ -1098,15 +1098,15 @@ int64_t FusionExecutor::ensureAvailableDynamicSmemSize( return getAvailableDynamicSmemSize(); } -void FusionExecutor::resetCompiledKernelProperties() { +void KernelExecutor::resetCompiledKernelProperties() { available_dynamic_smem_size_.reset(); static_smem_size_.reset(); } -std::vector FusionExecutor::evaluateFusionOutputs( +std::vector KernelExecutor::evaluateFusionOutputs( std::vector outputs, ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluateFusionOutputs"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::evaluateFusionOutputs"); NVF_ERROR( outputs.empty(), "Fusion executor is using expression evaluator,", @@ -1137,12 +1137,12 @@ at::Tensor findBufferForFusionOutput( } } // namespace -std::vector FusionExecutor::runFusion( +std::vector KernelExecutor::run( KernelArgumentHolder& args, const LaunchParams& launch_constraints, CompileParams 
compile_params, std::vector outputs) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion"); if (isProfilerEnabled()) { NVF_CHECK( @@ -1165,7 +1165,7 @@ std::vector FusionExecutor::runFusion( auto expr_eval = executor_utils::bindInputs(args, fusion()); if (isExpressionEvaluated(fusion())) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::evaluate_with_ExprEval"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::evaluate_with_ExprEval"); outputs = evaluateFusionOutputs(outputs, expr_eval); if (isProfilerEnabled()) { auto& sprof = FusionProfiler::segment(group_id_); @@ -1176,7 +1176,7 @@ std::vector FusionExecutor::runFusion( } if (host_ir_container_ != nullptr) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::host_ir_evaluate"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::host_ir_evaluate"); if (outputs.empty()) { std::vector output_info = getBufferInfos( expr_eval, PrimDataType::Int, host_ir_container_->outputs()); @@ -1204,7 +1204,7 @@ std::vector FusionExecutor::runFusion( return outputs; } - NVF_ERROR(validKernelId(), "Invalid kernel id for FusionExecutor."); + NVF_ERROR(validKernelId(), "Invalid kernel id for KernelExecutor."); NVF_ERROR( !args.getCacheId().has_value() || outputs.empty(), "short cut input cache is not compatible with pre-allocated output"); @@ -1276,7 +1276,7 @@ std::vector FusionExecutor::runFusion( std::vector intermediates; at::Tensor profile_buffer; { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::intermediates"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::intermediates"); for (const auto i : c10::irange(executor_entry->intermediates.size())) { const auto& buf_info = executor_entry->intermediates.at(i); bool has_expansion = false; @@ -1356,7 +1356,7 @@ std::vector FusionExecutor::runFusion( executor_utils::CudaKernelTimer timer(stream); if (execute_kernel_ && !kernel()->topLevelExprs().empty()) { - FUSER_PERF_SCOPE("FusionExecutor::runFusion::execute_kernel"); + FUSER_PERF_SCOPE("KernelExecutor::runFusion::execute_kernel"); ensureAvailableDynamicSmemSize(executor_entry->launch_params.smem()); recomputeArgs(*executor_entry, expr_eval, kernel()); @@ -1435,7 +1435,7 @@ std::vector FusionExecutor::runFusion( return outputs; } -int64_t FusionExecutor::inputBytesProcessed(const KernelArgumentHolder& args) { +int64_t KernelExecutor::inputBytesProcessed(const KernelArgumentHolder& args) { int64_t num_bytes = 0; // Figure how many bytes are inputs, outputs, and temporary buffers for (auto i : c10::irange(args.size())) { @@ -1447,7 +1447,7 @@ int64_t FusionExecutor::inputBytesProcessed(const KernelArgumentHolder& args) { return num_bytes; } -int64_t FusionExecutor::outputBytesProcessed( +int64_t KernelExecutor::outputBytesProcessed( const std::vector& outputs) { int64_t num_bytes = 0; for (auto i : c10::irange(outputs.size())) { @@ -1459,12 +1459,12 @@ int64_t FusionExecutor::outputBytesProcessed( return num_bytes; } -void FusionExecutor::compileRtc( +void KernelExecutor::compileRtc( const std::string& code, const std::string& name, bool structured, PrimDataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::compileRtc"); + FUSER_PERF_SCOPE("KernelExecutor::compileRtc"); NVF_ERROR( index_type == PrimDataType::Int || index_type == PrimDataType::Int32 || "Invalid index type: ", @@ -1482,11 +1482,11 @@ void FusionExecutor::compileRtc( executor_utils::getCompiledKernel(std::nullopt, scode, name, kernel_id_); } -float FusionExecutor::runRtc( +float KernelExecutor::runRtc( const LaunchParams& launch_params, const std::vector& 
args, PrimDataType index_type) { - FUSER_PERF_SCOPE("FusionExecutor::runRtc"); + FUSER_PERF_SCOPE("KernelExecutor::runRtc"); c10::DeviceGuard dg(options_.device); auto stream = at::cuda::getCurrentCUDAStream(); @@ -1547,9 +1547,9 @@ float FusionExecutor::runRtc( return kernel_time_ms; } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder) const { - // See table definition for FusionExecutor in serde/fusion_cache.fbs + // See table definition for KernelExecutor in serde/fusion_cache.fbs using fb_executor_entry = flatbuffers::Offset; // Separate unordered_map for executor_entry_lookup into key and value @@ -1564,10 +1564,10 @@ flatbuffers::Offset FusionExecutor::serialize( // When compilation is skipped, avoid serializing cubin because it doesn't // exist. The remaining fields are also not necessary in this case. if (!hasCompiledKernel()) { - return serde::CreateFusionExecutorDirect(builder); + return serde::CreateKernelExecutorDirect(builder); } - return serde::CreateFusionExecutorDirect( + return serde::CreateKernelExecutorDirect( builder, device_smem_limit_, block_size_high_water_mark_, @@ -1585,13 +1585,13 @@ flatbuffers::Offset FusionExecutor::serialize( serialize(builder, compiled_kernel_.get())); } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const executor_utils::CompiledKernel* compiled_kernel) const { NVF_ERROR( compiled_kernel_ != nullptr && (!compiled_kernel->cubin.empty() || !compiled_kernel->ptx.empty()), - "Expected compiled cuda kernel before serializing FusionExecutor."); + "Expected compiled cuda kernel before serializing KernelExecutor."); auto fb_kernel_name = builder.CreateString(compiled_kernel->kernel_name); auto fb_compile_args = builder.CreateString(compiled_kernel->compile_args); @@ -1631,7 +1631,7 @@ flatbuffers::Offset FusionExecutor::serialize( return ckb.Finish(); } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const ExecutorEntry& data) const { // See table definition for ExecutorEntry in serde/fusion_cache.fbs @@ -1683,7 +1683,7 @@ flatbuffers::Offset FusionExecutor::serialize( &intermediates_fb); } -flatbuffers::Offset FusionExecutor::serialize( +flatbuffers::Offset KernelExecutor::serialize( flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, int64_t tv_position, @@ -1701,8 +1701,8 @@ flatbuffers::Offset FusionExecutor::serialize( is_fusion_output); } -void FusionExecutor::deserialize( - const serde::FusionExecutor* buffer, +void KernelExecutor::deserialize( + const serde::KernelExecutor* buffer, Fusion* fusion, int8_t device_index, CompileParams compile_params, @@ -1711,15 +1711,15 @@ void FusionExecutor::deserialize( int64_t concrete_id, int64_t runtime_id, int64_t group_id) { - // See table definition for FusionExecutor in serde/fusion_cache.fbs + // See table definition for KernelExecutor in serde/fusion_cache.fbs - NVF_ERROR(buffer != nullptr, "serde::FusionExecutor is nullptr."); + NVF_ERROR(buffer != nullptr, "serde::KernelExecutor is nullptr."); // TODO Should we set fusion_id, concrete_id, runtime_id, and group_id when we // skip compilation? 
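The kernel_time_ms that runRtc returns above comes from CUDA event timing. A minimal sketch of that mechanism using the plain CUDA runtime API (the real code drives the driver API through executor_utils and checks every call's status):

#include <cuda_runtime.h>

template <typename LaunchFn>
float timeKernelMs(LaunchFn launch, cudaStream_t stream) {
  cudaEvent_t start_ev, stop_ev;
  cudaEventCreate(&start_ev);
  cudaEventCreate(&stop_ev);
  cudaEventRecord(start_ev, stream);
  launch(); // enqueue the kernel on `stream`
  cudaEventRecord(stop_ev, stream);
  cudaEventSynchronize(stop_ev); // wait until the kernel has finished
  float elapsed_ms = 0.0f;
  cudaEventElapsedTime(&elapsed_ms, start_ev, stop_ev);
  cudaEventDestroy(start_ev);
  cudaEventDestroy(stop_ev);
  return elapsed_ms;
}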
if (isExpressionEvaluated(fusion)) { fusion_ = std::make_unique(*fusion); - NVF_ERROR(!hasCompiledKernel(), "Failed to deserialize FusionExecutor"); + NVF_ERROR(!hasCompiledKernel(), "Failed to deserialize KernelExecutor"); return; } @@ -1781,10 +1781,10 @@ void FusionExecutor::deserialize( compiled_kernel_ = executor_utils::getCompiledKernel( buffer->compiled_kernel(), compile_params); - NVF_ERROR(hasCompiledKernel(), "Failed to deserialize FusionExecutor"); + NVF_ERROR(hasCompiledKernel(), "Failed to deserialize KernelExecutor"); } -FusionExecutor::ExecutorEntry FusionExecutor::deserialize( +KernelExecutor::ExecutorEntry KernelExecutor::deserialize( const serde::ExecutorEntry* buffer) { // See table definition for ExecutorEntry in serde/fusion_cache.fbs @@ -1807,7 +1807,7 @@ FusionExecutor::ExecutorEntry FusionExecutor::deserialize( return entry; } -GlobalBufferInfo FusionExecutor::deserialize( +GlobalBufferInfo KernelExecutor::deserialize( const serde::GlobalBufferInfo* buffer) { // See table definition for GlobalBufferInfo in serde/fusion_cache.fbs diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index fd68acc3baf..0b6d27fb752 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -34,15 +34,15 @@ struct CompileOptions { c10::Device device = c10::Device(c10::DeviceType::CUDA, 0); }; -class FusionExecutor : public NonCopyable { +class KernelExecutor : public NonCopyable { public: // NVF_API was added for nvfuser_extension. See examples/sinh_extension. - NVF_API FusionExecutor(); + NVF_API KernelExecutor(); //! To compile a fusion with the 32-bit index type, CompileParams //! must be passed in. There used to be an index type associated //! with KernelArgumentHolder, but it is no longer the case. - NVF_API void compileFusion( + NVF_API void compile( Fusion* fusion, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, @@ -56,25 +56,25 @@ class FusionExecutor : public NonCopyable { // TODO: merge it with the overload above. //! This API is merely here so we don't have to go back and update all cpp //! tests. - void compileFusion( + void compile( Fusion* fusion, const at::ArrayRef& inputs = {}, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams()) { KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(inputs); - compileFusion(fusion, args, launch_constraints, compile_params); + compile(fusion, args, launch_constraints, compile_params); } //! Used by user defined schedules in python frontend - void compileFusion( + void compile( Fusion* fusion, const at::ArrayRef& inputs, int64_t fusion_id, int64_t concrete_id) { KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(inputs); - compileFusion( + compile( fusion, args, LaunchParams(), @@ -92,15 +92,15 @@ class FusionExecutor : public NonCopyable { // TODO: args shouldn't come in a reference here because we will append the // outputs to be able to send it to the kernel. For now none of the users are // reconsuming the args, so it is okay. It isn't done now because changing it - // from a reference makes a call as runFusion({}) ambiguous, and that is used + // from a reference makes a call as run({}) ambiguous, and that is used // in some places in the codebase. 
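One more piece of the launch bookkeeping declared below: ensureAvailableDynamicSmemSize, defined earlier in executor.cpp, must raise a kernel's dynamic shared memory cap before launching with more than the default per-block limit. A minimal sketch of the underlying driver-API call, omitting the error handling and the device-limit validation that validateDynamicSmemSize performs:

#include <cuda.h>

CUresult allowDynamicSmemBytes(CUfunction func, int requested_bytes) {
  // Kernels must opt in to use more dynamic shared memory than the
  // default per-block limit (48KB on most architectures).
  return cuFuncSetAttribute(
      func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, requested_bytes);
}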
- NVF_API std::vector runFusion( + NVF_API std::vector run( KernelArgumentHolder& args, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams(), std::vector outputs = {}); - std::vector runFusion( + std::vector run( const at::ArrayRef& inputs, const std::vector& outputs, const LaunchParams& launch_constraints = LaunchParams(), @@ -111,15 +111,15 @@ class FusionExecutor : public NonCopyable { if (opt_code.has_value()) { args.setCacheId(*opt_code); } - return runFusion(args, launch_constraints, compile_params, outputs); + return run(args, launch_constraints, compile_params, outputs); } - std::vector runFusion( + std::vector run( const at::ArrayRef& inputs, const LaunchParams& launch_constraints = LaunchParams(), CompileParams compile_params = CompileParams(), const std::optional& opt_code = std::nullopt) { - return runFusion(inputs, {}, launch_constraints, compile_params, opt_code); + return run(inputs, {}, launch_constraints, compile_params, opt_code); } // Register a lowering hooks that are called to modify the GpuLower object @@ -135,7 +135,7 @@ class FusionExecutor : public NonCopyable { post_lowering_hooks_.push_back(std::move(hook)); } - // Function to query whether compilation was attempted for a `FusionExecutor` + // Function to query whether compilation was attempted for a `KernelExecutor` bool isCompiled() const { int num_compiled_artifacts = (fusion_ != nullptr) + (lowered_ != nullptr) + (host_ir_container_ != nullptr); @@ -143,7 +143,7 @@ class FusionExecutor : public NonCopyable { return num_compiled_artifacts == 1; }; - // function to query whether a `FusionExecutor` has a compiled kernel to + // function to query whether a `KernelExecutor` has a compiled kernel to // execute bool hasCompiledKernel() const { if (compiled_kernel_ != nullptr) { @@ -355,12 +355,12 @@ class FusionExecutor : public NonCopyable { } //! Serialize Fusion Executor using flatbuffers - flatbuffers::Offset serialize( + flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder) const; //! Deserialize Fusion Executor using flatbuffers void deserialize( - const serde::FusionExecutor* buffer, + const serde::KernelExecutor* buffer, Fusion* fusion, int8_t device_index, CompileParams compile_params, @@ -428,9 +428,9 @@ class FusionExecutor : public NonCopyable { flatbuffers::FlatBufferBuilder& builder, const executor_utils::CompiledKernel* kernel) const; - // ExecutorEntry is an internal POD struct for the FusionExecutor class. + // ExecutorEntry is an internal POD struct for the KernelExecutor class. // We define ExecutorEntry's serialize and deserialize as private methods in - // FusionExecutor. + // KernelExecutor. flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, const ExecutorEntry& data) const; @@ -438,9 +438,9 @@ class FusionExecutor : public NonCopyable { //! Deserialize ExecutorEntry using flatbuffers ExecutorEntry deserialize(const serde::ExecutorEntry* buffer); - // GlobalBufferInfo is an internal POD struct for the FusionExecutor class. + // GlobalBufferInfo is an internal POD struct for the KernelExecutor class. // We define GlobalBufferInfo's serialize and deserialize as private methods - // in FusionExecutor. + // in KernelExecutor. 
flatbuffers::Offset serialize( flatbuffers::FlatBufferBuilder& builder, const GlobalBufferInfo& data, diff --git a/csrc/runtime/executor_utils.cpp b/csrc/runtime/executor_utils.cpp index bb1c9b59f63..565f823d263 100644 --- a/csrc/runtime/executor_utils.cpp +++ b/csrc/runtime/executor_utils.cpp @@ -689,7 +689,7 @@ void validateVectorizedTensors( const std::vector& outputs, caching::ExecutorCompileTimeInfoCache* data_cache, ExpressionEvaluator& expr_eval) { - FUSER_PERF_SCOPE("FusionExecutor::validateVectorizedTensors"); + FUSER_PERF_SCOPE("KernelExecutor::validateVectorizedTensors"); validateAlignedVectorizedTensors( kernel, args, outputs, data_cache, expr_eval); diff --git a/csrc/runtime/executor_utils.h b/csrc/runtime/executor_utils.h index 8e99129356c..6c8418dc200 100644 --- a/csrc/runtime/executor_utils.h +++ b/csrc/runtime/executor_utils.h @@ -77,7 +77,7 @@ namespace caching { // the logic in the common space and re-use //! List of all the possible entry types in -//! `FusionExecutor` compile-time data cache. +//! `KernelExecutor` compile-time data cache. enum class CompileTimeEntryType { PARALLEL_BINDING_ITERDOMAINS, PARALLEL_ITER_EXTENT_MAP, @@ -91,7 +91,7 @@ enum class CompileTimeEntryType { //! Entry class definitions for each entry type: //! each class defines the data type for each entry type -//! Compile-time info to be cached in each FusionExecutor: +//! Compile-time info to be cached in each KernelExecutor: //! ParallelBindingIterDomains: //! Stores all the iterdomains that are parallelized //! on the scheduled Fusion graph. They will be used @@ -104,7 +104,7 @@ class ParallelBindingIterDomains { CompileTimeEntryType::PARALLEL_BINDING_ITERDOMAINS; }; -//! Compile-time info to be cached in each FusionExecutor: +//! Compile-time info to be cached in each KernelExecutor: //! ParallelIterExtentMap //! Stores the symbolic extents of all the parallelized //! iterdomains corresponding to each used parallel type. @@ -132,7 +132,7 @@ struct VectorizedTensorInfo { std::vector out_misaligned_tensors_pos; }; -//! Compile-time info to be cached in each FusionExecutor: +//! Compile-time info to be cached in each KernelExecutor: //! VectorizedTensorValidation //! Stores position info and vector word sizes of //! vectorized input/output tensors, to be used diff --git a/csrc/runtime/fusion_cache_utils.h b/csrc/runtime/fusion_cache_utils.h index a61f1844343..452192b11b8 100644 --- a/csrc/runtime/fusion_cache_utils.h +++ b/csrc/runtime/fusion_cache_utils.h @@ -28,7 +28,7 @@ class SegmentedFusion; // Utilities for benchmarking and profiling struct ExecutorLog { std::unique_ptr params = nullptr; - FusionExecutor* fusion_executor = nullptr; + KernelExecutor* fusion_executor = nullptr; }; struct RuntimeWorkSpace { @@ -153,7 +153,7 @@ class InputsIdLookup : public NonCopyable { //! Encode each input sets to with an unique id; //! The returned data structure also indicates whether eviction has happened //! within the lookup cache. This is needed because lookup shortcut is also - //! cached in nested `FusionExecutorCache` and `FusionExecutor`. + //! cached in nested `FusionExecutorCache` and `KernelExecutor`. //! see [ Note -- Post-definition cache implementation ] and [ Note -- 2 level //! cache implementation ]. //! 
diff --git a/csrc/runtime/fusion_executor_cache.cpp b/csrc/runtime/fusion_executor_cache.cpp index a3b8de148e2..af9c83da2b6 100644 --- a/csrc/runtime/fusion_executor_cache.cpp +++ b/csrc/runtime/fusion_executor_cache.cpp @@ -33,9 +33,6 @@ #include #include -#include -#include - namespace nvfuser { FusionExecutorCache::FusionExecutorCache( @@ -81,15 +78,7 @@ std::vector FusionExecutorCache::runFusionWithInputs( " failed"); } - int seq_id = 0; - // Record kernel input and output tensors so profiler can construct - // the data flow graph - RECORD_FUNCTION( - "run_fused_kernel", - std::vector(inputs.begin(), inputs.end()), - seq_id); auto outputs = kernel_runtime->runWithInputs(args); - RECORD_OUTPUTS(outputs); // Kernel time measurement is off by default kernel_runtime->disableKernelTimeMeasurement(); @@ -194,8 +183,8 @@ std::string FusionExecutorCache::getCode( if (intrinsic_code) { const auto& execs = kernel_runtime->executors(); - const FusionExecutor& fe = execs[0]; - auto index_type = fe.kernel()->indexType(); + const KernelExecutor& ke = execs[0]; + auto index_type = ke.kernel()->indexType(); // Make sure all the segment index types match. All segments currently // use the same index type but this code change in the future. for (const auto& exec : execs) { @@ -206,7 +195,7 @@ std::string FusionExecutorCache::getCode( " ", exec.kernel()->indexType()); } - std::string full_code = fe.getStructuredCode(kernel_code, index_type); + std::string full_code = ke.getStructuredCode(kernel_code, index_type); return full_code; } else { return kernel_code; @@ -492,7 +481,7 @@ void FusionExecutorCache::deserialize( device_runtimes.size())); // 3. For FusionKernelRuntime, we have a separate deserialize function - // to create the FusionExecutor objects. + // to create the KernelExecutor objects. device_runtimes.back()->deserialize( fb_fusion_kernel_runtime, args.getDeviceIndex()); diff --git a/csrc/runtime/fusion_executor_cache.h b/csrc/runtime/fusion_executor_cache.h index 6ec0435bbd0..0a0e0520cf8 100644 --- a/csrc/runtime/fusion_executor_cache.h +++ b/csrc/runtime/fusion_executor_cache.h @@ -63,7 +63,7 @@ enum class PrimDataType; //! properties might: rank, DataType, contiguity, stride order, size (whether a //! dimension has size=1). When all of these properties are repeated, there is //! an opportunity to reduce the latency of producing a compiled Fusion and -//! launch params (a FusionExecutor). Given inputs, we first compute an ID using +//! launch params (a KernelExecutor). Given inputs, we first compute an ID using //! InputsIdLookup::lookupId that encodes tensor properties along with values of //! any integer-valued input scalars that might affect concretization. This ID //! is guaranteed not to conflict unless the inputs can be executed by the same @@ -124,7 +124,7 @@ class FusionExecutorCache { int64_t fusion_id = 0, bool auto_schedule = true); - //! Execute fusion graph with given inputs, create `FusionExecutor` as needed + //! Execute fusion graph with given inputs, create `KernelExecutor` as needed //! Note this function also handles permutation & input update outside of //! codegen. //! @@ -140,12 +140,6 @@ class FusionExecutorCache { std::optional forced_index_type = std::nullopt, std::optional selected_device = std::nullopt); - //! Converts inputs from IValue to KernelArgumentHolder, also handles cache - //! lookup - KernelArgumentHolder prepareInputs( - const at::ArrayRef& inputs, - std::optional selected_device = std::nullopt); - //! 
query if there's a kernel ready to go for given inputs NVF_API bool isCompiled( const at::ArrayRef& inputs, @@ -241,8 +235,14 @@ class FusionExecutorCache { void deserialize(const serde::FusionExecutorCache* buffer, int64_t fusion_id); private: + //! Converts inputs from IValue to KernelArgumentHolder, also handles cache + //! lookup + KernelArgumentHolder prepareInputs( + const at::ArrayRef& inputs, + std::optional selected_device = std::nullopt); + //! evict cached short cut entry in `code_to_fe_lookup_` as well as cached - //! entry in `FusionExecutor` + //! entry in `KernelExecutor` void evictCache(size_t cache_id); //! The index type of forced_index_type is used to get a kernel diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp index 7a2768e0cbf..8069ed3ee3a 100644 --- a/csrc/runtime/fusion_kernel_runtime.cpp +++ b/csrc/runtime/fusion_kernel_runtime.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -125,7 +127,7 @@ FusionKernelRuntime::FusionKernelRuntime( // would go directly to kernel launch. prepareRuntimeOrder(segmented_fusion_.get(), runtime_workspace_); - executors_ = std::vector(segmented_fusion_->groups().size()); + executors_ = std::vector(segmented_fusion_->groups().size()); if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) { segmented_fusion_->print(); } @@ -142,8 +144,8 @@ FusionKernelRuntime::FusionKernelRuntime( } void FusionKernelRuntime::evictCache(size_t input_id) { - for (auto& fe : executors_) { - fe.evictCache(input_id); + for (auto& ke : executors_) { + ke.evictCache(input_id); } } @@ -159,8 +161,8 @@ flatbuffers::Offset FusionKernelRuntime::serialize( flatbuffers::FlatBufferBuilder& builder) const { // See table definition for FusionKernelRuntime in serde/fusion_cache.fbs - // 1. Serialize FusionExecutor objects - std::vector> executors_fb; + // 1. Serialize KernelExecutor objects + std::vector> executors_fb; executors_fb.reserve(executors_.size()); for (auto& executor : executors_) { executors_fb.push_back(executor.serialize(builder)); @@ -198,7 +200,7 @@ void FusionKernelRuntime::deserialize( runtime_id_ == buffer->runtime_id(), "Expected FusionKernelRuntime runtime_id to match serde runtime_id."); - // 1. Deserialize FusionExecutor objects + // 1. 
Deserialize KernelExecutor objects for (auto idx : c10::irange(buffer->executors()->size())) { auto sg = runtime_workspace_.group_run_order.at(idx); @@ -298,6 +300,14 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) { for (int64_t run_order_id = 0; run_order_id < num_groups; ++run_order_id) { auto group_to_run = runtime_workspace_.group_run_order.at(run_order_id); + if (isDebugDumpEnabled(DebugDumpOption::PythonDefinitionSegments)) { + debug() << "Python definition for segmented group " + << group_to_run->groupId() << ":" << std::endl; + python_frontend::FusionDefinition fd(/*id=*/std::nullopt); + python_frontend::translate(group_to_run->getFusion(), &fd); + fd.print(debug()); + } + // TODO: index mode should be updated per segmented kernel // Prepare input vector KernelArgumentHolder group_runtime_inputs; @@ -497,7 +507,7 @@ void FusionKernelRuntime::updateHeuristicsLaunchParams( } } -const std::vector& FusionKernelRuntime::executors() const { +const std::vector& FusionKernelRuntime::executors() const { return executors_; } @@ -595,7 +605,7 @@ std::vector FusionKernelRuntime::runKernelWithInput( if (executor.groupId() < 0) { executor.setGroupId(group_id); } - auto outputs = executor.runFusion(args, launch_params, compile_params); + auto outputs = executor.run(args, launch_params, compile_params); return outputs; } @@ -625,7 +635,7 @@ void FusionKernelRuntime::compileKernel( NVF_ERROR( heuristic_params->cparams.index_type.has_value(), "Kernel index type is not defined."); - executors_.at(group_id).compileFusion( + executors_.at(group_id).compile( fusion_to_run.get(), args, heuristic_params->lparams, diff --git a/csrc/runtime/fusion_kernel_runtime.h b/csrc/runtime/fusion_kernel_runtime.h index 7e34da833ce..5bf20883203 100644 --- a/csrc/runtime/fusion_kernel_runtime.h +++ b/csrc/runtime/fusion_kernel_runtime.h @@ -35,7 +35,7 @@ struct FusionKernelRuntime; //! //! Two types of instance can be created, one for complete/single-kernel fusion //! and one for segmented/multi-kernel fusion. -//! Conceptually this is a generalization of FusionExecutor that supports both +//! Conceptually this is a generalization of KernelExecutor that supports both //! single-kernel and multi-kernel caching/compiling/launching //! //! When serde_buffer argument is a nullptr, we run the @@ -143,7 +143,7 @@ class FusionKernelRuntime { //! for kernel launch for a new input dimension but same heuristics void updateHeuristicsLaunchParams(HeuristicParamsList* update_heuristics); - const std::vector& executors() const; + const std::vector& executors() const; private: //! Runs each fusion segment given arguments. The outputs for a fusion are @@ -176,7 +176,7 @@ class FusionKernelRuntime { private: //! Entries indexed by groupID: //! Executors holding compiled kernels - std::vector executors_; + std::vector executors_; // A metadata copy of initial arguments used to construct this // FusionKernelRuntime. Used during deserialization to schedule the fusion diff --git a/csrc/scheduler/cache_policy_refiner.cpp b/csrc/scheduler/cache_policy_refiner.cpp index 7e6eab7eb18..2159c93c969 100644 --- a/csrc/scheduler/cache_policy_refiner.cpp +++ b/csrc/scheduler/cache_policy_refiner.cpp @@ -58,6 +58,12 @@ bool isLoadGlobalToLocal(const Expr* expr) { if (ldst->opType() != LoadStoreOpType::Set) { return false; } + // It should not be necessary to check the output since it should always + // be a TensorView as long as the input is a TensorView, but we check + // just in case.
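+ // (Without this guard, the as<TensorView>() downcasts below would be an
+ // invalid downcast whenever in() or out() is a non-TensorView Val: in
+ // NDEBUG builds without NVFUSER_EXPLICIT_ERROR_CHECK, PolymorphicBase::as
+ // compiles to a static_cast and the mismatch is undefined behavior; in
+ // debug or NVFUSER_EXPLICIT_ERROR_CHECK builds the dynamic_cast path
+ // raises NVF_ERROR instead. See the csrc/utils.h change below.)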
+ if (!ldst->in()->isA() || !ldst->out()->isA()) { + return false; + } if (ldst->in()->as()->getMemoryType() != MemoryType::Global) { return false; } diff --git a/csrc/scheduler/compile_time_info.h b/csrc/scheduler/compile_time_info.h index d413c99ae81..f7ec9d4a97f 100644 --- a/csrc/scheduler/compile_time_info.h +++ b/csrc/scheduler/compile_time_info.h @@ -234,7 +234,7 @@ class CompileTimeInfoBase : public PolymorphicBase { //! Compile-time information cache for `canSchedule` and `getHeuristics` //! interfaces. Each cache instance stores information that could be inferred at //! compile time in a fusion and therefore corresponds to an instance of -//! FusionExecutor. +//! KernelExecutor. class HeuristicDataCache { using EntryOwningPtr = std::unique_ptr; diff --git a/csrc/scheduler/matmul_heuristic.h b/csrc/scheduler/matmul_heuristic.h index b97d8515011..ae7e7ff476e 100644 --- a/csrc/scheduler/matmul_heuristic.h +++ b/csrc/scheduler/matmul_heuristic.h @@ -138,9 +138,6 @@ class MatmulParams : public HeuristicParams { } } supported_vec_size; - //! Whether to rotate the ldmatrix out of the main loop - bool rotate_ldmatrix_out_of_main_loop = true; - //! (Ampere+) Use cp.async to load operands. bool async_gmem_load_operands = false; @@ -191,8 +188,6 @@ class MatmulParams : public HeuristicParams { << circular_buffer_options.toString() << "\n" << supported_vec_size.toString() << "\n" << nvfuser::toString(tile_sizes) << "\n" - << "Rotate ldmatrix out of main loop: " - << (rotate_ldmatrix_out_of_main_loop ? "true" : "false") << "\n" << "Async global mem load: " << (async_gmem_load_operands ? "true" : "false") << "\n" << "Indexing mode: " @@ -216,9 +211,8 @@ class MatmulParams : public HeuristicParams { size_t hash() const override { // combine boolean flags for hashing - size_t attr_hash = (static_cast(promote_prologue_smem_reuse) << 3) | - (static_cast(use_smem_epilogue) << 2) | - (static_cast(rotate_ldmatrix_out_of_main_loop) << 1) | + size_t attr_hash = (static_cast(promote_prologue_smem_reuse) << 2) | + (static_cast(use_smem_epilogue) << 1) | (static_cast(async_gmem_load_operands)); // combined hash @@ -240,8 +234,6 @@ class MatmulParams : public HeuristicParams { return other->cparams == cparams && other->mma_macro == mma_macro && other->async_gmem_load_operands == async_gmem_load_operands && - other->rotate_ldmatrix_out_of_main_loop == - rotate_ldmatrix_out_of_main_loop && other->tile_sizes == tile_sizes && other->circular_buffer_options == circular_buffer_options && other->supported_vec_size == supported_vec_size && diff --git a/csrc/scheduler/matmul_heuristic_plugin.cpp b/csrc/scheduler/matmul_heuristic_plugin.cpp index c1b7acf00c4..658ad2b07f7 100644 --- a/csrc/scheduler/matmul_heuristic_plugin.cpp +++ b/csrc/scheduler/matmul_heuristic_plugin.cpp @@ -146,8 +146,6 @@ void copyParamsToConfig(KernelConfig* config, const MatmulParams* mparams) { : 1; config->circular_buffer_smem_read = mparams->circular_buffer_options.circular_buffer_smem_read; - config->rotate_ldmatrix_out_of_main_loop = - mparams->rotate_ldmatrix_out_of_main_loop; config->problem.supported_vec_size.a = (uint8_t)mparams->supported_vec_size.a; config->problem.supported_vec_size.b = (uint8_t)mparams->supported_vec_size.b; config->problem.supported_vec_size.epilogue = @@ -190,8 +188,6 @@ void copyConfigToParams(MatmulParams* mparams, const KernelConfig* config) { } mparams->circular_buffer_options.circular_buffer_smem_read = config->circular_buffer_smem_read; - mparams->rotate_ldmatrix_out_of_main_loop = - 
config->rotate_ldmatrix_out_of_main_loop; // enable circular buffering if configured mparams->circular_buffer_options.circular_buffer_smem_write = diff --git a/csrc/scheduler/matmul_heuristic_plugin_api.h b/csrc/scheduler/matmul_heuristic_plugin_api.h index 65094cb9d8c..224705530e5 100644 --- a/csrc/scheduler/matmul_heuristic_plugin_api.h +++ b/csrc/scheduler/matmul_heuristic_plugin_api.h @@ -77,7 +77,6 @@ struct KernelConfig { uint8_t grid_swizzle_factor = 0; uint8_t cta_order = 0; bool circular_buffer_smem_read = true; - bool rotate_ldmatrix_out_of_main_loop = true; bool async_gmem_load_operands = true; public: diff --git a/csrc/scheduler/matmul_utils.cpp b/csrc/scheduler/matmul_utils.cpp index 799d519a8d2..86ad7e5144d 100644 --- a/csrc/scheduler/matmul_utils.cpp +++ b/csrc/scheduler/matmul_utils.cpp @@ -411,7 +411,7 @@ class VectorizationCalculator { //! To analyze vectorization, we need to know pointer alignment, sizes, and //! strides. SchedulerRuntimeInfo contains all this info about fusion - //! inputs, but fusion outputs are allocated by FusionExecutor so they are + //! inputs, but fusion outputs are allocated by KernelExecutor so they are //! absent from SchedulerRuntimeInfo. //! //! This function just extracts sizes and strides from runtime_info_ when diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp index 99c60607063..c18e8d31ab3 100644 --- a/csrc/scheduler/reduction.cpp +++ b/csrc/scheduler/reduction.cpp @@ -402,6 +402,7 @@ std::unique_ptr inner2dReductionHeuristic( << std::endl; debug() << rparams->toString() << std::endl; } + return rparams; } diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp index 750cdb43597..260f5813be7 100644 --- a/csrc/scheduler/utils.cpp +++ b/csrc/scheduler/utils.cpp @@ -2565,7 +2565,7 @@ int64_t getSharedMemoryOverheadPerBlock( dtype_size = std::max(dtype_size, dataTypeSize(tv->getDataType().value())); } // for welford, three arrays of type nvfuser_index_t are used to store var, - // avg, and n. see FusionExecutor::computeLaunchParams. Here index type is + // avg, and n. see KernelExecutor::computeLaunchParams. Here index type is // assumed as int64_t int64_t welford_factor = ir_utils::hasOpsOfType(fusion) ? 3l : 1l; if (welford_factor == 3l) { diff --git a/csrc/serde/Serde.md b/csrc/serde/Serde.md index c0528950d6b..5f5bcbc18bb 100644 --- a/csrc/serde/Serde.md +++ b/csrc/serde/Serde.md @@ -27,19 +27,19 @@ The string's position in the cache becomes the input's cache id. This table represents a key-value pair in the unordered_map. ### FusionKernelRuntime -* `FusionKernelRuntime` contains the segments for a Fusion. Each segment is represented by a `FusionExecutor` object. +* `FusionKernelRuntime` contains the segments for a Fusion. Each segment is represented by a `KernelExecutor` object. #### Serialization: * We save a metadata copy of the arguments used to construct the `FusionKernelRuntime`. During deserialization, -we call the constructor using the saved metadata arguments. Afterwards, we regenerate the `FusionExecutor` objects, +we call the constructor using the saved metadata arguments. Afterwards, we regenerate the `KernelExecutor` objects, which are normally built by calling `compileFusionParallel` outside the constructor. ### KernelArgumentHolder * A collection of `PolymorphicValue` objects representing Scalars [`int, double, bool, complex`], Cpu Scalars, and Gpu Tensors. * **Note:** Pointer address of meta aten tensors is zero. The pointer address is used to specify vectorization during schedule. 
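To make the save/restore flow above concrete, here is a minimal round-trip sketch. It is illustrative only: the `serialize` entry point on `FusionExecutorCache` is assumed to mirror `FusionKernelRuntime::serialize(flatbuffers::FlatBufferBuilder&)`, and `fusion_id = 0` is a placeholder; only the `deserialize(const serde::FusionExecutorCache*, int64_t)` signature is taken from this change, so treat the rest as a sketch rather than the exact API.

```cpp
#include <flatbuffers/flatbuffers.h>

// Sketch under the assumptions stated above; not the verbatim nvFuser API.
void roundTrip(nvfuser::FusionExecutorCache& executor_cache) {
  flatbuffers::FlatBufferBuilder builder;

  // Serialize: records the KernelArgumentHolder metadata plus one
  // serde::KernelExecutor table per fusion segment (assumed entry point).
  builder.Finish(executor_cache.serialize(builder));

  // Deserialize: the FusionKernelRuntime constructor is re-run with the
  // saved metadata arguments, then each KernelExecutor is regenerated
  // instead of being rebuilt through compileFusionParallel.
  const auto* fb_cache =
      flatbuffers::GetRoot<nvfuser::serde::FusionExecutorCache>(
          builder.GetBufferPointer());
  executor_cache.deserialize(fb_cache, /*fusion_id=*/0);
}
```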
-### FusionExecutor -* `FusionExecutor` defines two data structs: `ExecutorEntry` and `GlobalBufferInfo` +### KernelExecutor +* `KernelExecutor` defines two data structs: `ExecutorEntry` and `GlobalBufferInfo` * `ExecutorEntry` contains information to launch a kernel for a set of input arguments. It contains the launch parameters, output-to-input alias map, and global buffer configurations. * `GlobalBufferInfo` specifies the buffer's tensor properties [`shape, stride, dtype`] and its corresponding TensorView. diff --git a/csrc/serde/fusion_cache.fbs b/csrc/serde/fusion_cache.fbs index 0cc499416b6..b21e4ea4f82 100644 --- a/csrc/serde/fusion_cache.fbs +++ b/csrc/serde/fusion_cache.fbs @@ -156,7 +156,7 @@ table Scalar { } // ===================================================================================== -// Tables for PolymorphicValue, ScalarCpu, TensorArg, KernelArgumentHolder used in FusionExecutor. +// Tables for PolymorphicValue, ScalarCpu, TensorArg, KernelArgumentHolder used in KernelExecutor. // The ScalarCpu is represented by a fixed size array of raw bytes. table ScalarCpu { @@ -188,7 +188,7 @@ table KernelArgumentHolder { // // ===================================================================================== -// Tables for LaunchParams, GlobalBufferInfo, ExecutorEntry, and TensorShape used in FusionExecutor +// Tables for LaunchParams, GlobalBufferInfo, ExecutorEntry, and TensorShape used in KernelExecutor // Data representing a tensor shape used in LaunchParam table TensorShape { @@ -355,7 +355,7 @@ table CudaKernel { } // Each Fusion Executor maps to a lowered and compiled kernel. -table FusionExecutor { +table KernelExecutor { device_smem_limit: long; block_size_high_water_mark: long; maxrregcount_high_water_mark: long; @@ -415,14 +415,14 @@ table SegmentedFusion { // Each FusionKernelRuntime represents a concretized, segmented Fusion. // We store the metadata for the original arguments to segment, schedule, and compile the Fusion at deserialization. -// Each fusion segment is given a FusionExecutor. +// Each fusion segment is given a KernelExecutor. // The unscheduled fusion is defined by traversing Trie in FusionCache. table FusionKernelRuntime { fusion_id: long; concrete_id: long; runtime_id: long; args: KernelArgumentHolder; - executors: [FusionExecutor]; + executors: [KernelExecutor]; segmented_fusion: SegmentedFusion; } diff --git a/csrc/serde/polymorphic_value.h b/csrc/serde/polymorphic_value.h index 6ca56a1c69a..5c0245303f9 100644 --- a/csrc/serde/polymorphic_value.h +++ b/csrc/serde/polymorphic_value.h @@ -21,7 +21,7 @@ namespace nvfuser::serde { //! PolymorphicValue table. This factory creates Bool, ComplexDouble, Double, //! Long, CPU Scalar, and CUDA Tensor objects. These arguments are stored in //! KernelArgumentHolder, which is used to schedule the fusion in -//! FusionKernelRuntime and to run a kernel in FusionExecutor. +//! FusionKernelRuntime and to run a kernel in KernelExecutor. class PolymorphicValueFactory : public Factory { public: diff --git a/csrc/transform_replay.cpp b/csrc/transform_replay.cpp index 093715f92a8..06e15929aa9 100644 --- a/csrc/transform_replay.cpp +++ b/csrc/transform_replay.cpp @@ -52,25 +52,20 @@ class ReplaySelf : public ReplayTransformations { loop_ids_.find(mapped) != loop_ids_.end(), "Transform traversal failed, modified a node but it was not a loop node."); - // outer loop size - Val* remainder = ceilDiv(mapped->extent(), s->factor()); - - // Manually replay the split, following the output of the operations. 
- // This is so rfactor ops are replayed correctly. - IterDomain* ido = IterDomainBuilder(s->outer()) - .start(s->container()->zeroVal()) - .extent(s->innerSplit() ? remainder : s->factor()) - .build(); - - // inner IterDomain - IterDomain* idi = IterDomainBuilder(s->inner()) - .start(s->container()->zeroVal()) - .extent(s->innerSplit() ? s->factor() : remainder) - .build(); - - // Generate the split node - IrBuilder::createInContainer( - s->container(), ido, idi, mapped, s->factor(), s->innerSplit()); + NVF_ERROR(s->outer()->isRFactorProduct() == s->inner()->isRFactorProduct()); + + // Due to rfactor transformations, the iter types of the outputs + // may not follow the default rule. For example, even if the input + // is a reduction iter domain, the outputs may not. To replay the + // original split expression, the output iter types need to be + // specified explicitly. + auto [ido, idi] = IterDomain::split( + mapped, + s->factor(), + s->innerSplit(), + s->outer()->isRFactorProduct(), + s->outer()->getIterType(), + s->inner()->getIterType()); // Remove mapped id from loop IDs loop_ids_.erase(mapped); @@ -107,16 +102,7 @@ class ReplaySelf : public ReplayTransformations { id_inner_mapped, " however one or both are not loop nodes."); - Val* merged_id_size = - mul(id_outer_mapped->extent(), id_inner_mapped->extent()); - - IterDomain* merged_id = IterDomainBuilder(m->out()) - .start(m->container()->zeroVal()) - .extent(merged_id_size) - .build(); - - IrBuilder::createInContainer( - m->container(), merged_id, id_outer_mapped, id_inner_mapped); + IterDomain* merged_id = IterDomain::merge(id_outer_mapped, id_inner_mapped); // Remove inputs from the loop IDs loop_ids_.erase(id_outer_mapped); diff --git a/csrc/transform_rfactor.cpp b/csrc/transform_rfactor.cpp index 311bec23796..07799487eb0 100644 --- a/csrc/transform_rfactor.cpp +++ b/csrc/transform_rfactor.cpp @@ -108,9 +108,6 @@ class ReplayRFactor : public ReplayTransformations { loop_ids_.find(mapped) != loop_ids_.end(), "Transform traversal failed, modified a node but it was not a loop node."); - // outer loop size - Val* remainder = ceilDiv(mapped->extent(), s->factor()); - // Check if we need to mark the outputs as a logical domain, meaning this // transformation must be present in replays otherwise it breaks the compute // definition of the fusion. Iter domains are actually not static, it's the @@ -119,32 +116,27 @@ class ReplayRFactor : public ReplayTransformations { bool static_logical_outputs = static_logical_ids_.count(s->outer()) || static_logical_ids_.count(s->inner()); - // Manually replay the split, making reduction = false and rfactor = true - // outer IterDomain - IterDomain* ido = - IterDomainBuilder( - s->container()->zeroVal(), - s->innerSplit() ? remainder : s->factor()) - .iter_type( - rfactor_axes_.count(s->outer()) ? IterType::Reduction - : IterType::Iteration) - .is_rfactor_domain(static_logical_outputs) - .build(); + // Let IterDomain::split determine the correct IterType, except + // when the output is a reduction domain but not part of the + // rfactored domains. If it isn't involved in the rfactor, it's no + // longer a reduction domain. + std::optional outer_iter_type; + if (s->outer()->isReduction() && !rfactor_dep_ids_.count(s->outer())) { + outer_iter_type = IterType::Iteration; + } - // inner IterDomain - IterDomain* idi = - IterDomainBuilder( - s->container()->zeroVal(), - s->innerSplit() ? s->factor() : remainder) - .iter_type( - rfactor_axes_.count(s->inner()) ?
IterType::Reduction - : IterType::Iteration) - .is_rfactor_domain(static_logical_outputs) - .build(); + std::optional inner_iter_type; + if (s->inner()->isReduction() && !rfactor_dep_ids_.count(s->inner())) { + inner_iter_type = IterType::Iteration; + } - // Generate the split node - IrBuilder::createInContainer( - s->container(), ido, idi, mapped, s->factor(), s->innerSplit()); + auto [ido, idi] = IterDomain::split( + mapped, + s->factor(), + s->innerSplit(), + static_logical_outputs, + outer_iter_type, + inner_iter_type); // Remove mapped id from loop IDs loop_ids_.erase(mapped); @@ -182,23 +174,20 @@ id_inner_mapped, " however one or both are not loop nodes."); - Val* merged_id_size = - mul(id_outer_mapped->extent(), id_inner_mapped->extent()); - - bool is_bcast = - id_outer_mapped->isBroadcast() && id_inner_mapped->isBroadcast(); - auto iter_type = rfactor_axes_.count(m->out()) - ? IterType::Reduction - : (is_bcast ? IterType::Broadcast : IterType::Iteration); - - IterDomain* merged_id = - IterDomainBuilder(m->container()->zeroVal(), merged_id_size) - .iter_type(iter_type) - .is_rfactor_domain(static_logical_ids_.count(m->out())) - .build(); + // Let IterDomain::merge determine the correct IterType, except + // when the output is a reduction domain but not part of the + // rfactored domains. If it isn't involved in the rfactor, it's no + // longer a reduction domain. + std::optional iter_type; + if (m->out()->isReduction() && !rfactor_dep_ids_.count(m->out())) { + iter_type = IterType::Iteration; + } - IrBuilder::createInContainer( - m->container(), merged_id, id_outer_mapped, id_inner_mapped); + IterDomain* merged_id = IterDomain::merge( + id_outer_mapped, + id_inner_mapped, + static_logical_ids_.count(m->out()), + iter_type); // Remove inputs from the loop IDs loop_ids_.erase(id_outer_mapped); @@ -236,6 +225,9 @@ class ReplayRFactor : public ReplayTransformations { // The IterDomains in the original_domain that are being factored into the // first stage of the two stage reduction (the producer). std::unordered_set rfactor_axes_; + // All iter domains between the logical and the loop domains that the + // rfactor_axes_ depend on. + std::unordered_set rfactor_dep_ids_; // Iter domains whose history cannot be changed as it would break rfactor // dependencies.
std::unordered_set static_logical_ids_; @@ -262,6 +254,14 @@ class ReplayRFactor : public ReplayTransformations { rfactor_axes_(std::move(rfactor_axes)), static_logical_ids_(std::move(static_logical_ids)), logical_domain_(original_domain->logical()) { + const auto all_dep_vals = DependencyCheck::getAllValsBetween( + {original_domain->maybeRoot().begin(), + original_domain->maybeRoot().end()}, + {rfactor_axes_.begin(), rfactor_axes_.end()}); + + auto all_dep_ids = ir_utils::filterByType(all_dep_vals); + rfactor_dep_ids_.insert(all_dep_ids.begin(), all_dep_ids.end()); + setErrorOnFailure(false); } }; diff --git a/csrc/utils.h b/csrc/utils.h index f98d2e357a2..d831a6695a5 100644 --- a/csrc/utils.h +++ b/csrc/utils.h @@ -112,23 +112,23 @@ class PolymorphicBase { // (checked in DEBUG builds) template T* as() { -#ifdef NDEBUG +#if defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK) auto downcast_ptr = static_cast(this); #else auto downcast_ptr = dynamic_cast(this); NVF_ERROR(downcast_ptr != nullptr); -#endif +#endif // defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK) return downcast_ptr; } template const T* as() const { -#ifdef NDEBUG +#if defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK) auto downcast_ptr = static_cast(this); #else auto downcast_ptr = dynamic_cast(this); NVF_ERROR(downcast_ptr != nullptr); -#endif +#endif // defined(NDEBUG) && !defined(NVFUSER_EXPLICIT_ERROR_CHECK) return downcast_ptr; } diff --git a/doc/dev/python_scheduling/autotune_pointwise.py b/doc/dev/python_scheduling/autotune_pointwise.py index 5034ebfd5c6..014ae8197a2 100644 --- a/doc/dev/python_scheduling/autotune_pointwise.py +++ b/doc/dev/python_scheduling/autotune_pointwise.py @@ -89,7 +89,7 @@ def inner_fn(): if config is not None: vectorization_factor, unroll_factor = config schedule_params.vectorization_factor = vectorization_factor - schedule_params.unroll_factor = unroll_factor + schedule_params.unroll_factor_inner = unroll_factor # Schedule fusion fd.sched.schedule() diff --git a/examples/sinh_extension/main.cpp b/examples/sinh_extension/main.cpp index e44086dbbe3..c6dd6b7fe01 100644 --- a/examples/sinh_extension/main.cpp +++ b/examples/sinh_extension/main.cpp @@ -34,9 +34,9 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) { auto heuristic_params = SchedulerEntry::scheduleWith(&fusion, SchedulerType::PointWise, {input}); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, heuristic_params->lparams); - auto outputs = fe.runFusion({input}, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, {input}, heuristic_params->lparams); + auto outputs = ke.run({input}, heuristic_params->lparams); return outputs[0]; } diff --git a/examples/sinh_libtorch/main.cpp b/examples/sinh_libtorch/main.cpp index 8c83f6d0e23..12f58d08c33 100644 --- a/examples/sinh_libtorch/main.cpp +++ b/examples/sinh_libtorch/main.cpp @@ -31,9 +31,9 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) { auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::PointWise, {input}); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, heuristic_params->lparams); - auto outputs = fe.runFusion({input}, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, {input}, heuristic_params->lparams); + auto outputs = ke.run({input}, heuristic_params->lparams); return outputs[0]; } diff --git a/nvfuser/contrib/nn/normalization.py b/nvfuser/contrib/nn/normalization.py index 4d05eb538ec..c01faf86cdb 100644 --- a/nvfuser/contrib/nn/normalization.py +++ 
b/nvfuser/contrib/nn/normalization.py @@ -401,12 +401,6 @@ def forward( tv_running_mean = partially_contig_tensor(fd, running_mean) tv_running_var = partially_contig_tensor(fd, running_var) inputs.extend([running_mean, running_var]) - if running_mean.dtype in [torch.half, torch.bfloat16]: - tv_running_mean = fd.ops.cast( - tv_running_mean, nvfuser.DataType.Float - ) - if running_var.dtype in [torch.half, torch.bfloat16]: - tv_running_var = fd.ops.cast(tv_running_var, nvfuser.DataType.Float) s_momentum = fd.define_scalar(nvfuser.DataType.Double) s_eps = fd.define_scalar(nvfuser.DataType.Double) diff --git a/setup.py b/setup.py index 55dcc041f92..4f3fd6c28fa 100644 --- a/setup.py +++ b/setup.py @@ -79,6 +79,7 @@ BUILD_WITH_ASAN = False BUILD_WITHOUT_DISTRIBUTED = False OVERWRITE_VERSION = False +EXPLICIT_ERROR_CHECK = False VERSION_TAG = None BUILD_TYPE = "Release" WHEEL_NAME = "nvfuser" @@ -107,6 +108,9 @@ if arg == "--build-with-ucc": BUILD_WITH_UCC = True continue + if arg == "--explicit-error-check": + EXPLICIT_ERROR_CHECK = True + continue if arg == "--build-with-asan": BUILD_WITH_ASAN = True continue @@ -330,6 +334,8 @@ def cmake(): ] if BUILD_WITH_UCC: cmd_str.append("-DNVFUSER_STANDALONE_BUILD_WITH_UCC=ON") + if EXPLICIT_ERROR_CHECK: + cmd_str.append("-DNVFUSER_EXPLICIT_ERROR_CHECK=ON") if not NO_NINJA: cmd_str.append("-G") cmd_str.append("Ninja") diff --git a/tests/cpp/test_alias.cpp b/tests/cpp/test_alias.cpp index 68337688656..7c1b4df46af 100644 --- a/tests/cpp/test_alias.cpp +++ b/tests/cpp/test_alias.cpp @@ -50,10 +50,11 @@ TEST_F(AliasTest, View) { TensorView* out = reshape(in, in_shape, out_shape); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -61,7 +62,8 @@ TEST_F(AliasTest, View) { EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); // Verify output values. 
- testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, View_AliasForSameLayout) { @@ -80,13 +82,15 @@ TEST_F(AliasTest, View_AliasForSameLayout) { {in->axis(1), in->axis(2), in->axis(0)}, {true, false, false}); out->setAllocationDomain({out->axis(1), out->axis(0)}, false); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({60}).cuda().as_strided({2, 3, 4}, {2, 20, 5}); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensor.is_alias_of(in_tensor)); } @@ -105,12 +109,14 @@ TEST_F(AliasTest, View_AliasForCompliantLayout) { out->setAllocationDomain({out->axis(0), out->axis(1)}, {false, false}); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensor.is_alias_of(in_tensor)); } @@ -131,10 +137,11 @@ TEST_F(AliasTest, View_NoAliasForIncompliantLayout) { // alias. out->setAllocationDomain({out->axis(1), out->axis(0)}, true); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -142,7 +149,8 @@ TEST_F(AliasTest, View_NoAliasForIncompliantLayout) { EXPECT_FALSE(out_tensor.is_alias_of(in_tensor)); // Verify output values. - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, ViewPermute) { @@ -158,10 +166,11 @@ TEST_F(AliasTest, ViewPermute) { out = permute(out, {1, 0}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -169,7 +178,8 @@ TEST_F(AliasTest, ViewPermute) { EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); // Verify output values. 
- testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, DuplicateOutputs) { @@ -185,10 +195,11 @@ TEST_F(AliasTest, DuplicateOutputs) { fusion->addOutput(out); fusion->addOutput(out); // duplicated outputs - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn(in_shape, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 2); at::Tensor out_tensor_0 = out_tensors[0]; at::Tensor out_tensor_1 = out_tensors[1]; @@ -196,12 +207,13 @@ TEST_F(AliasTest, DuplicateOutputs) { // Verify aliasing among duplicated outputs EXPECT_TRUE(out_tensor_0.is_alias_of(out_tensor_1)); // Verify no segmentation - EXPECT_FALSE(fec.getMostRecentKernelRuntime()->isSegmented()) + EXPECT_FALSE(executor_cache.getMostRecentKernelRuntime()->isSegmented()) << "segmentation is not supposed to happen"; at::Tensor expected_out_tensor = in_tensor.add(3.141); // Verify output values. - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, SliceToSizeOne_Issue1353) { @@ -213,14 +225,14 @@ TEST_F(AliasTest, SliceToSizeOne_Issue1353) { TensorView* out = slice(in, {0, 0, 0}, {4, 6, 1}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({4, 6, 7}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); EXPECT_THAT(out_tensor.strides(), ElementsAre(42, 7, _)); testValidate( - fec.fusion(), + executor_cache.fusion(), {in_tensor.slice(/*dim=*/2, /*start=*/c10::nullopt, /*end=*/1)}, {in_tensor}, __LINE__, @@ -236,14 +248,14 @@ TEST_F(AliasTest, SliceRightOfBroadcast) { TensorView* out = slice(in, {0, 0, 0}, {4, 1, 5}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({4, 1, 7}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; EXPECT_EQ(in_tensor.data_ptr(), out_tensor.data_ptr()); EXPECT_THAT(out_tensor.strides(), ElementsAre(7, _, 1)); testValidate( - fec.fusion(), + executor_cache.fusion(), {in_tensor.slice(/*dim=*/2, /*start=*/c10::nullopt, /*end=*/5)}, {in_tensor}, __LINE__, @@ -274,9 +286,10 @@ TEST_F(AliasTest, SliceViewPermute) { fusion->addOutput(split); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({batches, seq_length, features * 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); EXPECT_EQ(out_tensors.size(), 3); for (const auto& out_tensor : out_tensors) { @@ -292,7 +305,7 @@ TEST_F(AliasTest, SliceViewPermute) { } testValidate( - fec.fusion(), + executor_cache.fusion(), out_tensors, {in_tensor}, expected_out_tensors, @@ -317,11 +330,13 @@ TEST_F(AliasTest, 
DuplicateOutputsSegmentedFusion) { fusion->addOutput(out); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn(in_shape, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); // Verify aliasing among duplicated outputs EXPECT_TRUE(out_tensors[0].is_alias_of(out_tensors[1])); @@ -329,22 +344,26 @@ TEST_F(AliasTest, DuplicateOutputsSegmentedFusion) { // Verify segmentation EXPECT_EQ( - fec.getMostRecentKernelRuntime()->fusionSegments()->groups().size(), 2) + executor_cache.getMostRecentKernelRuntime() + ->fusionSegments() + ->groups() + .size(), + 2) << "segmentation didn't happen as expected"; } namespace { // Returns the only executor in the most recent runtime. -const FusionExecutor& onlyExecutorInMostRecentRuntime( - const FusionExecutorCache& fec) { - const std::vector& executors = - fec.getMostRecentKernelRuntime()->executors(); +const KernelExecutor& onlyExecutorInMostRecentRuntime( + const FusionExecutorCache& executor_cache) { + const std::vector& executors = + executor_cache.getMostRecentKernelRuntime()->executors(); EXPECT_EQ(executors.size(), 1); return executors.front(); } -bool storesToOutput(const FusionExecutor& executor, const int64_t out_index) { +bool storesToOutput(const KernelExecutor& executor, const int64_t out_index) { // Get the variable name from the `kir::Kernel` not the input fusion, because // they are not always the same. std::string var_name = @@ -371,10 +390,12 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) { fusion->addOutput(broadcast_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); @@ -385,7 +406,7 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) { // that stores only to the output of the add. // // - broadcast & expand. This segment is meta-op only. 
- FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -394,17 +415,17 @@ TEST_F(AliasTest, NotAllOutputsAlias_Pointwise) { for (SegmentedGroup* group : runtime->fusionSegments()->groups()) { if (group->schedulerType() == SchedulerType::PointWise) { - const FusionExecutor& fe = runtime->executors().at(group->groupId()); + const KernelExecutor& ke = runtime->executors().at(group->groupId()); int num_stores = 0; for (auto i : c10::irange(group->outputs().size())) { - if (storesToOutput(fe, i)) { + if (storesToOutput(ke, i)) { num_stores++; } } EXPECT_EQ(num_stores, 1) << "The generated CUDA kernel is expected to store data to one output:" << std::endl - << fe.kernelString(); + << ke.kernelString(); } } } @@ -427,13 +448,15 @@ TEST_F(AliasTest, NotAllOutputsAlias_Reduction) { fusion->addOutput(view_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({16 * 12 * 128 * 192}) .cuda() .as_strided({16, 12, 128, 192}, {128 * 12 * 192, 192, 12 * 192, 1}); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); EXPECT_TRUE(out_tensors[2].is_alias_of(in_tensor)); @@ -452,15 +475,17 @@ TEST_F(AliasTest, Issue1452) { fusion->addOutput(set_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({1024, 1024}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor set_out_tensor = out_tensors[0]; EXPECT_TRUE(set_out_tensor.is_alias_of(in_tensor)); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -469,17 +494,17 @@ TEST_F(AliasTest, Issue1452) { for (SegmentedGroup* group : runtime->fusionSegments()->groups()) { if (group->schedulerType() == SchedulerType::PointWise) { - const FusionExecutor& fe = runtime->executors().at(group->groupId()); + const KernelExecutor& ke = runtime->executors().at(group->groupId()); int num_stores = 0; for (auto i : c10::irange(group->outputs().size())) { - if (storesToOutput(fe, i)) { + if (storesToOutput(ke, i)) { num_stores++; } } EXPECT_EQ(num_stores, 1) << "The generated CUDA kernel is expected to store data to one output:" << std::endl - << fe.kernelString(); + << ke.kernelString(); } } } @@ -495,20 +520,22 @@ TEST_F(AliasTest, AliasOutputBeforeNonAliasOutput) { fusion->addOutput(slice_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - 
testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor slice_out_tensor = out_tensors[0]; EXPECT_TRUE(slice_out_tensor.is_alias_of(in_tensor)); - const FusionExecutor& fe = onlyExecutorInMostRecentRuntime(fec); - EXPECT_FALSE(storesToOutput(fe, /*out_index=*/0)) + const KernelExecutor& ke = onlyExecutorInMostRecentRuntime(executor_cache); + EXPECT_FALSE(storesToOutput(ke, /*out_index=*/0)) << "The generated CUDA kernel shouldn't store data to output 0:" << std::endl - << fe.kernelString(); + << ke.kernelString(); } TEST_F(AliasTest, Set_NoAliasForIncompatibleLayout) { @@ -523,9 +550,10 @@ TEST_F(AliasTest, Set_NoAliasForIncompatibleLayout) { // I intentionally set the allocation order to be different to block aliasing. out->setAllocationDomain({out->axis(1), out->axis(2), out->axis(0)}, true); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -549,10 +577,11 @@ TEST_F(AliasTest, DuplicateOutputsComplex) { // duplicated output fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 4); // Verify aliases among outputs. @@ -561,7 +590,8 @@ TEST_F(AliasTest, DuplicateOutputsComplex) { EXPECT_TRUE(out_tensors[0].is_alias_of(out_tensors[3])); // Verify output values. 
- testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } // test verifying that duplicated input is not allowed in nvfuser @@ -593,10 +623,12 @@ TEST_F(AliasTest, AliasInSegment) { fusion->addOutput(add_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); } @@ -617,17 +649,20 @@ TEST_F(AliasTest, TrivialInputForwarding) { at::Tensor t0 = at::randn({10, 4}).cuda(); at::Tensor t1 = at::randn({10, 4}).cuda(); - FusionExecutorCache fec(std::move(fusion)); - std::vector cg_outputs = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + std::vector cg_outputs = + executor_cache.runFusionWithInputs({t0, t1}); EXPECT_EQ(cg_outputs[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs, {t0, t1}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, {t0, t1}, __LINE__, __FILE__); // Second run to ensure cache hit handles trivial forwarding properly - EXPECT_TRUE(fec.isCompiled({t0, t1})); - auto cg_outputs2 = fec.runFusionWithInputs({t0, t1}); + EXPECT_TRUE(executor_cache.isCompiled({t0, t1})); + auto cg_outputs2 = executor_cache.runFusionWithInputs({t0, t1}); EXPECT_EQ(cg_outputs2[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs2, {t0, t1}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs2, {t0, t1}, __LINE__, __FILE__); } TEST_F(AliasTest, TrivialInputForwarding_ScalarTensor) { @@ -640,16 +675,16 @@ TEST_F(AliasTest, TrivialInputForwarding_ScalarTensor) { at::Tensor t0 = at::randn({}).cuda(); - FusionExecutorCache fec(std::move(fusion)); - auto cg_outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); EXPECT_EQ(cg_outputs[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs, {t0}, __LINE__, __FILE__); // Second run to ensure cache hit handles trivial forwarding properly - EXPECT_TRUE(fec.isCompiled({t0})); - auto cg_outputs2 = fec.runFusionWithInputs({t0}); + EXPECT_TRUE(executor_cache.isCompiled({t0})); + auto cg_outputs2 = executor_cache.runFusionWithInputs({t0}); EXPECT_EQ(cg_outputs2[0].data_ptr(), t0.data_ptr()); - testValidate(fec.fusion(), cg_outputs2, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs2, {t0}, __LINE__, __FILE__); } TEST_F(AliasTest, OutputAliasesAnotherOutput) { @@ -665,10 +700,12 @@ TEST_F(AliasTest, OutputAliasesAnotherOutput) { fusion->addOutput(reshape_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + 
executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); ASSERT_EQ(out_tensors.size(), 2); EXPECT_TRUE(out_tensors[1].is_alias_of(out_tensors[0])); @@ -689,12 +726,14 @@ TEST_F(AliasTest, OutputNotAliasedByAnotherOutputShouldNotBeSegmented) { fusion->addOutput(reshape_out); fusion->addOutput(mul_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_FALSE(runtime->isSegmented()); } @@ -716,10 +755,12 @@ TEST_F(AliasTest, ManyAliasesBetweenOutputs) { fusion->addOutput(permute_out); fusion->addOutput(add_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); ASSERT_EQ(out_tensors.size(), 4); at::Tensor slice_out_tensor = out_tensors[0]; at::Tensor reshape_out_tensor = out_tensors[1]; @@ -732,7 +773,7 @@ TEST_F(AliasTest, ManyAliasesBetweenOutputs) { // Segment 1: in -> add_out // Segment 2: add_out -> its output aliases - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); } @@ -750,12 +791,14 @@ TEST_F(AliasTest, DoNotOverSegment_Straightline) { fusion->addOutput(permute_out); fusion->addOutput(mul_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_FALSE(runtime->isSegmented()); // permute_out should be recognized as an alias of add_out. 
However, the @@ -781,12 +824,14 @@ TEST_F(AliasTest, DoNotOverSegment_WithForks) { fusion->addOutput(out1); fusion->addOutput(out2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::PointWise)).Times(1)); @@ -804,10 +849,11 @@ TEST_F(AliasTest, Broadcast) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -826,10 +872,11 @@ TEST_F(AliasTest, Expand) { broadcast_tv->axis(2)->extent()}); fusion->addOutput(expanded_tv); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -848,10 +895,11 @@ TEST_F(AliasTest, MergeTwoExpandedBroadcasts) { TensorView* out = reshape(in, {4, 5, 6}, {20, -1}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({1}).cuda().as_strided({4, 5, 6}, {0, 0, 0}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); // TODO(#1126): This should become an alias when #1126 is fixed. 
// EXPECT_TRUE(out_tensor.is_alias_of(in_tensor)); @@ -872,11 +920,12 @@ TEST_F(AliasTest, MergeBroadcastsBetweenConcretes) { out = reshape(out, {2, 15, 7}, {30, 7}); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2 * 7}).cuda().as_strided({2, 3, 5, 7}, {7, 0, 0, 1}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(AliasTest, Squeeze) { @@ -888,10 +937,11 @@ TEST_F(AliasTest, Squeeze) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 1, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -906,10 +956,12 @@ TEST_F(AliasTest, SourceIsBothInputAndOutput) { fusion->addOutput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(in_tensor.data_ptr(), out_tensors[0].data_ptr()); EXPECT_EQ(in_tensor.data_ptr(), out_tensors[1].data_ptr()); @@ -929,12 +981,13 @@ TEST_F(AliasTest, SegmentBoundary) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -955,12 +1008,12 @@ TEST_F(AliasTest, ReuseBuffer) { auto tensor = at::randn({10}, options); auto expected_tensor = tensor + 1.0; - FusionExecutorCache fec(std::move(fusion)); - fec.runFusionWithInputs({tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({tensor}); EXPECT_TRUE(tensor.allclose(expected_tensor)); } -TEST_F(AliasTest, ReuseBuffer_FusionExecutor) { +TEST_F(AliasTest, ReuseBuffer_KernelExecutor) { Fusion fusion; FusionGuard fg(&fusion); TensorView* in = makeContigTensor(1); @@ -972,9 +1025,9 @@ TEST_F(AliasTest, ReuseBuffer_FusionExecutor) { auto tensor = at::randn({10}, options); auto expected_tensor = tensor + 1.0; - FusionExecutor fe; - 
fe.compileFusion(&fusion, {tensor}); - fe.runFusion({tensor}, {tensor}); + KernelExecutor ke; + ke.compile(&fusion, {tensor}); + ke.run({tensor}, {tensor}); EXPECT_TRUE(tensor.allclose(expected_tensor)); } @@ -1010,18 +1063,27 @@ TEST_F(AliasTest, ReuseBuffer_AliasAcrossSegments) { at::Tensor t1 = at::randn({65}, options); at::Tensor t2 = at::randn({128, 65}, options); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); // Make a copy of `t0` because `t0` will be in-place updated. at::Tensor original_t0 = t0.clone(); - std::vector outputs = fec.runFusionWithInputs({t0, t1, t2}); + std::vector outputs = + executor_cache.runFusionWithInputs({t0, t1, t2}); testValidate( - fec.fusion(), outputs, {original_t0, t1, t2}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + {original_t0, t1, t2}, + __LINE__, + __FILE__); // https://github.com/NVIDIA/Fuser/pull/2999 will cause 3 segments instead of // the optimal 2 segments. Change back to 2 segments once // https://github.com/NVIDIA/Fuser/issues/3251 is resolved. EXPECT_EQ( - fec.getMostRecentKernelRuntime()->fusionSegments()->groups().size(), 3) + executor_cache.getMostRecentKernelRuntime() + ->fusionSegments() + ->groups() + .size(), + 3) << "segmentation didn't happen as expected"; auto t3 = original_t0.add(1.0); @@ -1055,16 +1117,17 @@ TEST_F(AliasTest, AliasOnlyKernelsAreNotLaunched) { fusion->addOutput(add_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor in_tensor = at::randn({2, 3}, options); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); if (ProfilerState::Running == FusionProfiler::state()) { FusionProfiler::stop(); } ProfilerOptionsGuard::getCurOptions().unset(ProfilerOption::Enable); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); const FusionProfile& profile = FusionProfiler::profile(); // Expect a kernel launched for one of the two segments but not the @@ -1094,13 +1157,14 @@ TEST_F(AliasTest, PerfDebugVerboseWhenSomeKernelsNotLaunched) { fusion->addOutput(add_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor in_tensor = at::randn({2, 3}, options); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -1127,10 +1191,10 @@ TEST_F(AliasTest, NoKernelsAreLaunched) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor in_tensor = at::randn({2, 3}, options); - fec.runFusionWithInputs({in_tensor}); + executor_cache.runFusionWithInputs({in_tensor}); if 
(ProfilerState::Running == FusionProfiler::state()) { FusionProfiler::stop(); @@ -1146,8 +1210,8 @@ TEST_F(AliasTest, NoKernelsAreLaunched) { } // While most use cases go through FusionExecutorCache, nvFuser also supports -// evaluating an alias via FusionExecutor. -TEST_F(AliasTest, FusionExecutor) { +// evaluating an alias via KernelExecutor. +TEST_F(AliasTest, KernelExecutor) { Fusion fusion; FusionGuard fg(&fusion); @@ -1160,15 +1224,15 @@ TEST_F(AliasTest, FusionExecutor) { AliasAnalysisResult analysis = findAliases(&fusion); EXPECT_EQ(analysis.getRoot(out), in); - // Mark them alias so FusionExecutor::runFusion expression-evaluates the + // Mark them alias so KernelExecutor::run expression-evaluates the // output on the host instead of launching a CUDA kernel. fusion.aliasOutputToInput(out, in, AllocationType::Evaluate); - FusionExecutor fe; + KernelExecutor ke; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({10, 10}, options); - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = fe.runFusion({in_tensor})[0]; + ke.compile(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.run({in_tensor})[0]; EXPECT_EQ(out_tensor.data_ptr(), in_tensor.data_ptr()); } @@ -1182,13 +1246,13 @@ TEST_F(AliasTest, InplaceUpdate) { fusion->addInput(out); fusion->aliasOutputToInput(out, in, AllocationType::ReuseBuffer); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); at::Tensor out_tensor = in_tensor + 1; - fec.runFusionWithInputs({in_tensor, out_tensor}); + executor_cache.runFusionWithInputs({in_tensor, out_tensor}); EXPECT_TRUE(out_tensor.equal(in_tensor)); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre(HeuristicIs(SchedulerType::PointWise))); @@ -1209,10 +1273,12 @@ TEST_F(AliasTest, Bookend_SegmentSetPreservesAllocation) { permute_out->setAllocationDomain( {permute_out->axis(0), permute_out->axis(1)}, true); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({3, 2}).cuda().transpose(0, 1); - std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector<at::Tensor> out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor permute_out_tensor = out_tensors[0]; EXPECT_TRUE(permute_out_tensor.is_alias_of(in_tensor)); @@ -1230,15 +1296,17 @@ TEST_F(AliasTest, Bookend_InputsAndOutputs) { fusion->addOutput(permute_out); fusion->addOutput(compute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector<at::Tensor> out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); at::Tensor permute_out_tensor = out_tensors[0]; EXPECT_TRUE(permute_out_tensor.is_alias_of(in_tensor)); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime*
runtime = executor_cache.getMostRecentKernelRuntime(); // MarkAliasesPrepare adds a `segment_set` between `in` and `permute`, which // leads to three segments: // 1. `segment_set`, a no-op segment, @@ -1269,12 +1337,14 @@ TEST_F(AliasTest, Bookend_IntermediateTensors) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector<at::Tensor> out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -1303,15 +1373,17 @@ TEST_F(AliasTest, Bookend_AliasesOfSameTensor) { fusion->addOutput(out1); fusion->addOutput(out2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector<at::Tensor> out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors[0].data_ptr(), out_tensors[1].data_ptr()); EXPECT_EQ(out_tensors[0].data_ptr(), out_tensors[2].data_ptr()); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::PointWise)).Times(1)); @@ -1338,14 +1410,16 @@ TEST_F(AliasTest, Bookend_ReuseSegmentSet) { fusion->addOutput(out0); fusion->addOutput(out1); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector<at::Tensor> out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors[0].data_ptr(), out_tensors[1].data_ptr()); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( @@ -1384,13 +1458,15 @@ TEST_F(AliasTest, QKVSplitBackprop) { fusion->addOutput(view_out); fusion->addOutput(permute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector<c10::IValue> in_tensors; for (int i = 0; i < 3; i++) { in_tensors.push_back(at::randn({b, s, h * f}).cuda()); } - std::vector<at::Tensor> out_tensors = fec.runFusionWithInputs(in_tensors); - testValidate(fec.fusion(), out_tensors, in_tensors, __LINE__, __FILE__); + std::vector<at::Tensor> out_tensors = + executor_cache.runFusionWithInputs(in_tensors); + testValidate( + executor_cache.fusion(), out_tensors, in_tensors, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[2].is_alias_of(out_tensors[1])); } @@ -1419,12
+1495,12 @@ TEST_F(AliasTest, Bookend_Issue2375) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(input_shape, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({t0}); - testValidate(fec.fusion(), out_tensors, {t0}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({t0}); + testValidate(executor_cache.fusion(), out_tensors, {t0}, __LINE__, __FILE__); EXPECT_THAT( - fec.getMostRecentKernelRuntime()->fusionSegments()->groups(), + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(), UnorderedElementsAre( HeuristicIs(SchedulerType::NoOp), HeuristicIs(SchedulerType::InnerPersistent))); @@ -1458,10 +1534,15 @@ TEST_F(AliasTest, Issue2664) { auto t2 = at::randn({}, options); auto aten_out = (t2 + 1.0) * t1; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({t1, t2}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({t1, t2}); testValidate( - fec.fusion(), out_tensors, {t1, t2}, {aten_out}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {t1, t2}, + {aten_out}, + __LINE__, + __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_alias_analysis.cpp b/tests/cpp/test_alias_analysis.cpp index ef72282f1b2..3b50a399b0b 100644 --- a/tests/cpp/test_alias_analysis.cpp +++ b/tests/cpp/test_alias_analysis.cpp @@ -182,11 +182,11 @@ TEST_F(AliasAnalysisTest, View_ForwardExpandedBroadcast) { EXPECT_EQ(analysis.getRoot(out), in); // Verify the last dimension isn't expanded physically. - FusionExecutor fe; + KernelExecutor ke; at::Tensor in_tensor = at::randn({4, 5}).cuda().as_strided({4, 5, 6}, {5, 1, 0}); - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = fe.runFusion({in_tensor})[0]; + ke.compile(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.run({in_tensor})[0]; EXPECT_THAT(out_tensor.strides(), ElementsAre(1, 0)); } diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index 167374c3799..55ac0ee99d4 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -29,8 +29,7 @@ using ::testing::ElementsAre; // A global->shared->global copy kernel, shared memory allocated transposed to // avoid bank conflict. TEST_F(AllocationDomainTest, TransposedIntermediate) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigConcreteTensor({32, 32}); @@ -58,17 +57,16 @@ TEST_F(AllocationDomainTest, TransposedIntermediate) { at::Tensor t0 = at::randn({32, 32}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } // A global->global copy kernel converting NCHW memory format into NHWC, with a // 4d allocation domain in output. 
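// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original patch): a minimal, self-contained
// version of the NCHW->NHWC pattern used by the tests below, assuming the
// test file's existing includes and fixtures. FusionExecutorCache is used so
// the fusion is scheduled automatically; the test name is hypothetical.
TEST_F(AllocationDomainTest, Sketch_NCHWInput_NHWCOutput) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  TensorView* tv0 = makeContigTensor(4); // logical order [N, C, H, W]
  fusion->addInput(tv0);
  TensorView* tv1 = set(tv0);
  fusion->addOutput(tv1);

  // Allocate the output in N, H, W, C order; `true` marks the allocation
  // domain contiguous, i.e. a dense NHWC buffer.
  tv1->setAllocationDomain(
      {tv1->axis(0), tv1->axis(2), tv1->axis(3), tv1->axis(1)}, true);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({2, 8, 4, 4}, options);

  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
  ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast));
}
// ---------------------------------------------------------------------------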
TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -96,10 +94,10 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { at::Tensor t0 = at::randn({n, c, h, w}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -109,8 +107,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC4d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 1d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -135,10 +132,10 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { at::Tensor t0 = at::randn({n, c, h, w}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -148,8 +145,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC1d) { // A global->global copy kernel converting NCHW memory format into NHWC, with a // 2d allocation domain in output. TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -175,10 +171,10 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { at::Tensor t0 = at::randn({n, c, h, w}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -188,8 +184,7 @@ TEST_F(AllocationDomainTest, NCHW4d_To_NHWC2d) { // Reshape and transpose a 3d tensor into an NHWC tensor with a 3d allocation // domain in fusion output. TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -222,10 +217,10 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { at::Tensor t0 = at::randn({n1, n2, h * w * c}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -242,8 +237,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC3d) { // output. The allocation domain is on both the producer and the consumer side // of the rFactor domain. 
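// ---------------------------------------------------------------------------
// Editor's note (not part of the original patch): the executor rename this
// diff applies mechanically across the test suite, shown side by side. Only
// the class and method names change; argument lists and return types are
// unchanged.
//
//   // Before                            // After
//   FusionExecutor fe;                   KernelExecutor ke;
//   fe.compileFusion(&fusion, {t0});     ke.compile(&fusion, {t0});
//   auto outs = fe.runFusion({t0});      auto outs = ke.run({t0});
//   fe.kernel()->summary();              ke.kernel()->summary();
//
// Likewise, local FusionExecutorCache variables are renamed from `fec` to
// `executor_cache`, with no behavioral change.
// ---------------------------------------------------------------------------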
TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n1 = 31, n2 = 29, h = 64, w = 104, c = 21; @@ -282,10 +276,10 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { at::Tensor t0 = at::randn({n1, n2, c * h * w}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -301,8 +295,7 @@ TEST_F(AllocationDomainTest, Tensor3d_To_NHWC4d_FwdBwd) { // A global->global copy kernel where both inputs and outputs are NHWC memory // format TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -338,15 +331,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -356,8 +349,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view the input as a 1d tensor. TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -397,15 +389,15 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -415,8 +407,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC4d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain of the output view the output as a 1d tensor. 
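// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original patch) of the wrong-stride guard
// that the NHWC tests below exercise via ThrowsMessage: the executor
// validates runtime strides against the input's allocation domain, so a
// tensor with the same sizes but plain NCHW strides is rejected. This sketch
// skips explicit scheduling for brevity (the real tests schedule the fusion
// before compiling; whether an unscheduled copy is acceptable here is an
// assumption), and uses EXPECT_THROW instead of matching the exact message.
// The test name is hypothetical.
TEST_F(AllocationDomainTest, Sketch_WrongStrideRejected) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeContigTensor(4);
  fusion.addInput(tv0);
  TensorView* tv1 = set(tv0);
  fusion.addOutput(tv1);

  // Declare the input as a dense NHWC buffer.
  tv0->setAllocationDomain(
      {tv0->axis(0), tv0->axis(2), tv0->axis(3), tv0->axis(1)}, true);

  int n = 2, c = 8, h = 4, w = 4;
  at::Tensor t0_wrong_format = at::randn({n, c, h, w}).cuda(); // NCHW strides
  at::Tensor t0 =
      t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c});

  KernelExecutor ke;
  ke.compile(&fusion, {t0});
  EXPECT_THROW(ke.run({t0_wrong_format}), std::exception); // stride mismatch
  auto cg_outputs = ke.run({t0}); // the declared layout runs fine
}
// ---------------------------------------------------------------------------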
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -453,15 +444,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -471,8 +462,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC1d) { // A global->global copy kernel where both inputs are NHWC memory format. The // allocation domain view both the input and the output as a 1d tensors. TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -514,15 +504,15 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -533,8 +523,7 @@ TEST_F(AllocationDomainTest, NHWC1d_To_NHWC1d) { // allocation domain view the input as a 2d tensor of shape [N*H/8, 8*W*C], and // view the output as a 2d tensor of shape [N*H*W*C/4, 4] TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -582,15 +571,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -599,8 +588,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d) { // Similar to NHWC4d_To_NHWC4d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -647,15 +635,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, 
{t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -664,8 +652,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheBefore) { // Similar to NHWC2d_To_NHWC2d, but does a cacheBefore TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -724,15 +711,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -741,8 +728,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheBefore) { // Similar to NHWC4d_To_NHWC4d, but does a cacheAfter TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -789,15 +775,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -808,8 +794,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheAfter) { // allocation tensor to be between rFactor domain and loop domain, which is not // the case for NHWC2d_To_NHWC2d TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -860,15 +845,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "merging of discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -877,8 +862,7 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheAfter) { // Similar to NHWC4d_To_NHWC4d, but does a cacheFork 
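// ---------------------------------------------------------------------------
// Editor's note (not part of the original patch) on the cache* variants the
// following tests exercise. The one-line summaries are the editor's reading
// of the TensorView API, and the snippet only builds IR; scheduling and
// execution are omitted.
TEST_F(AllocationDomainTest, Sketch_CacheOps) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeContigTensor(2);
  fusion.addInput(tv0);
  TensorView* tv1 = set(tv0);
  fusion.addOutput(tv1);
  TensorView* tv2 = set(tv1); // tv1 is an output *and* has a consumer
  fusion.addOutput(tv2);

  tv0->cacheAfter();  // stage an input: copy inserted between tv0 and users
  tv2->cacheBefore(); // stage an output: copy inserted before the write
  tv1->cacheFork();   // fork an output that also feeds other expressions
}
// ---------------------------------------------------------------------------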
TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(4); @@ -932,15 +916,15 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Stride mismatch with contiguity info"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -949,8 +933,7 @@ TEST_F(AllocationDomainTest, NHWC4d_To_NHWC4d_cacheFork) { // Similar to NHWC2d_To_NHWC2d, but does a cacheFork TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); int n = 31, h = 64, w = 103, c = 21; @@ -1022,15 +1005,15 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { at::Tensor t0 = t0_wrong_format.as_strided({n, c, h, w}, {h * w * c, 1, w * c, c}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); EXPECT_THAT( - [&]() { fe.runFusion({t0_wrong_format}); }, + [&]() { ke.run({t0_wrong_format}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "splitting one dimension into discontiguous dimensions is not allowed in allocation domain"))); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); @@ -1038,30 +1021,29 @@ TEST_F(AllocationDomainTest, NHWC2d_To_NHWC2d_cacheFork) { } TEST_F(AllocationDomainTest, VectorizationIssue902) { - auto fusion_ptr = std::make_unique(); - auto& fusion = *fusion_ptr; - FusionGuard fg(fusion_ptr.get()); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); const std::vector shape({16, 16, 512, 64}); auto tv0 = makeContigTensor(4); - fusion.addInput(tv0); + fusion->addInput(tv0); auto tv1 = set(tv0); - fusion.addOutput(tv1); + fusion->addOutput(tv1); - std::vector aloc_domain; - aloc_domain.push_back(tv1->axis(0)); - aloc_domain.push_back(tv1->axis(2)); - aloc_domain.push_back(tv1->axis(3)); - aloc_domain.push_back(tv1->axis(1)); - tv1->setAllocationDomain(aloc_domain, true); + std::vector alloc_domain; + alloc_domain.push_back(tv1->axis(0)); + alloc_domain.push_back(tv1->axis(2)); + alloc_domain.push_back(tv1->axis(3)); + alloc_domain.push_back(tv1->axis(1)); + tv1->setAllocationDomain(alloc_domain, true); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutorCache executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion)); auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); ASSERT_TRUE(cg_outputs[0].equal(t0)); @@ -1101,9 +1083,8 @@ TEST_F(AllocationDomainTest, TransposeMatrix) { } TEST_F(AllocationDomainTest, ContiguityIssue1021) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1119,17 
+1100,16 @@ TEST_F(AllocationDomainTest, ContiguityIssue1021) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({8, 8}, options).as_strided({4, 8}, {1, 8}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForBroadcast) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(2) @@ -1145,17 +1125,16 @@ TEST_F(AllocationDomainTest, ContiguityForBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({1, 1}, options).as_strided({1, 1}, {0, 3}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { - std::unique_ptr fusion_ptr = std::make_unique(); - Fusion* fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); auto tv0 = TensorViewBuilder() .ndims(3) @@ -1172,11 +1151,11 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4, 8}, options).as_strided({3, 8, 4}, {0, 1, 8}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({t0}); auto t1 = t0.add(5.0); - testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } // Test that allocation domain can be used to vectorize overlapping tensors, @@ -1189,8 +1168,7 @@ TEST_F(AllocationDomainTest, ContiguityForExplicitBroadcast) { // automatically supports all kinds of use cases, even those that we don't have // an active plan to support on). 
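// ---------------------------------------------------------------------------
// Editor's worked example (not part of the original patch) for the
// overlapping layout used in the test below: a flat buffer of 4*5*7 floats
// is viewed with sizes {4, 5, 7} and strides {7, 4, 1}, so element (i, j, k)
// lives at offset 7*i + 4*j + k. Because the middle stride (4) is smaller
// than the span of the inner dimension (7), distinct indices alias:
//   (0, 0, 4) -> 7*0 + 4*0 + 4 = 4
//   (0, 1, 0) -> 7*0 + 4*1 + 0 = 4
// Stride-based contiguity analysis alone therefore cannot prove that
// vectorized access is safe, which is why the test specifies an explicit
// allocation domain instead.
at::Tensor overlapping =
    at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1});
// ---------------------------------------------------------------------------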
TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { - auto fusion_ptr = std::make_unique(); - Fusion& fusion = *fusion_ptr.get(); + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(3); @@ -1225,9 +1203,9 @@ TEST_F(AllocationDomainTest, VectorizeOverlappingTensor) { at::Tensor t0 = at::randn({4 * 5 * 7}).cuda().as_strided({4, 5, 7}, {7, 4, 1}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -1250,14 +1228,14 @@ TEST_F(AllocationDomainTest, Issue1290_ContiguityWasMissing) { at::Tensor in_tensor = at::randn({2 * 4}).cuda().as_strided({2, 3}, {4, 1}); - FusionExecutorCache fec(std::move(fusion)); - fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({in_tensor}); // The initial issue was detected in the pointwise scheduler, so I added these // checks to make sure it's a valid regression test. The transpose scheduler // could accept this but decided not to because of a small problem size. const std::vector& groups = - fec.getMostRecentKernelRuntime()->fusionSegments()->groups(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); ASSERT_EQ(groups.size(), 1); SegmentedGroup* group = groups[0]; EXPECT_EQ(group->schedulerType(), SchedulerType::PointWise); @@ -1275,9 +1253,9 @@ TEST_F(AllocationDomainTest, Issue1290_ReplayCasPFailedDueToDifferentRanks) { out->cacheBefore(); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - FusionExecutor fe; - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor out_tensor = fe.runFusion({in_tensor})[0]; + KernelExecutor ke; + ke.compile(&fusion, {in_tensor}); + at::Tensor out_tensor = ke.run({in_tensor})[0]; EXPECT_THAT(out_tensor.sizes(), ElementsAre(2)); } @@ -1311,8 +1289,8 @@ TEST_F(AllocationDomainTest, Issue1524) { {permute_out->axis(1), permute_out->axis(0)}, true); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - FusionExecutorCache fec(std::move(fusion)); - fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({in_tensor}); } TEST_F(AllocationDomainTest, EmptyAllocationDomainApi) { diff --git a/tests/cpp/test_allocation_order_inference.cpp b/tests/cpp/test_allocation_order_inference.cpp index b936d2252d0..c24d679bfbb 100644 --- a/tests/cpp/test_allocation_order_inference.cpp +++ b/tests/cpp/test_allocation_order_inference.cpp @@ -315,9 +315,9 @@ TEST_F(AllocationOrderInferenceTest, EnableInRuntime) { at::Tensor in_tensor = at::randn({2, 4, 8, 8}, options); at::Tensor in_nhwc = in_tensor.as_strided({2, 4, 8, 8}, {4 * 8 * 8, 1, 4 * 8, 4}); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); - auto cg_outputs = fec.runFusionWithInputs({in_nhwc}); + auto cg_outputs = executor_cache.runFusionWithInputs({in_nhwc}); auto ref_out = in_nhwc.relu(); EXPECT_TRUE(cg_outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast)); diff --git a/tests/cpp/test_circular_buffering.cpp b/tests/cpp/test_circular_buffering.cpp index d607579196f..cb0d27eafd2 100644 --- a/tests/cpp/test_circular_buffering.cpp +++ b/tests/cpp/test_circular_buffering.cpp @@ -64,17 +64,17 @@ TEST_P(CircularBufferingTest, SingleDim1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, 
options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 1, the axis_extent is I0/128. constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -112,17 +112,17 @@ TEST_P(CircularBufferingTest, SingleDim2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 1, the axis_extent is I0/128. constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -167,17 +167,17 @@ TEST_P(CircularBufferingTest, SingleDim3) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 2, the axis_extent is 128/32. constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -219,18 +219,18 @@ TEST_P(CircularBufferingTest, SingleDimUnswitch1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis -1 and axis 3 is parallelized with TIDx, the axis // extent is 4. constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -271,18 +271,18 @@ TEST_P(CircularBufferingTest, SingleDimUnswitch2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis -1 and axis 3 is parallelized with TIDx, the axis // extent is 4. 
constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -325,18 +325,18 @@ TEST_P(CircularBufferingTest, SingleDimUnroll) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({199}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis -1 and axis 4 is parallelized with TIDx, the axis // extent is 2. constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 2; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -372,18 +372,18 @@ TEST_P(CircularBufferingTest, SingleDimVectorize) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 2 and axis 1 is parallelized with TIDx, the axis // extent is I0/128. constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -424,17 +424,17 @@ TEST_P(CircularBufferingTest, MultipleTensors) { auto t0 = at::randn({500}, options); auto t1 = at::randn({500}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); // Given computeAt axis 1, the axis extent is I0/32/4. constexpr int64_t axis_extent = 1; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); } @@ -475,19 +475,19 @@ TEST_P(CircularBufferingTest, NestedTensors) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1001}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Given computeAt axis 1 for tv2, the axis extent is I0/32/4 = 8. // Given computeAt axis 3 for tv3 and axis 3 is parallelized with TIDx, // the axis extent is 4. 
constexpr int64_t axis_extent = 4; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -569,16 +569,16 @@ TEST_P(CircularBufferingTest, SmemBlockGemmCache) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); constexpr int64_t axis_extent = 2; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); // The smem cache write in this test case is redundant predicated, @@ -586,7 +586,7 @@ TEST_P(CircularBufferingTest, SmemBlockGemmCache) { // insertion to ensure ordering of circular buffered tensor access. // The check below makes sure that the sync is inserted so that the // test isn't running on a race condition. - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count > 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count > 0); } // Vectorized reset test for circular buffered registers @@ -623,16 +623,16 @@ TEST_P(CircularBufferingTest, Vector) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); constexpr int64_t axis_extent = 8; if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum({0}); testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } @@ -678,14 +678,14 @@ TEST_P(CircularBufferingTest, CpAsync1) { at::Tensor t0 = at::randn({m, n}, options); at::Tensor t1 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0, t1})); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; @@ -731,14 +731,14 @@ TEST_P(CircularBufferingTest, CpAsync2) { at::Tensor t0 = at::randn({m, n}, options); at::Tensor t1 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0, t1})); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; @@ -794,9 +794,9 @@ TEST_P(CircularBufferingTest, NoSync) { }); NVF_ERROR(!sync_inserted, "Un-expected block sync inserted"); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto ref = t0 + t1; @@ -971,9 +971,9 @@ TEST_F(NVFuserTest, 
ElectSyncCompatibility) { // (threadIdx.x < 4) predicate. This thread predicate is incompatible with // circular buffering because we generate an ElectSync predicate that uses // a single thread. - FusionExecutor fe; + KernelExecutor ke; try { - fe.compileFusion(fusion.get(), {t0}); + ke.compile(fusion.get(), {t0}); } catch (const std::exception& e) { const char* reference = R"(This thread-parallelized TensorView T2_s_float[ iblockIdx.x15{( ceilDiv(( ceilDiv(( ceilDiv(( ( ( (( (( getMetaData(T0) )).logical_size ))[0] ) * ( (( (( getMetaData(T0) )).logical_size ))[1] ) ) * ( (( (( getMetaData(T0) )).logical_size ))[2] ) ), 256) ), 4) ), 2) )}, iS16{2}, ithreadIdx.x14{4}, iB12{256} ] ca_pos( 2 ) is incorrectly contained within a If-Then-Else with the ElectSync predicate.)"; @@ -1023,10 +1023,10 @@ TEST_P(TmaCircularBufferingTest, SingleDim) { at::Tensor t0 = at::randn({tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1076,17 +1076,17 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnroll) { at::Tensor t0 = at::randn({tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); int64_t axis_extent = ceilDiv(ceilDiv(tensor_inner_dim, bulk_inner_dim), unroll_dim); if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1136,17 +1136,17 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) { at::Tensor t0 = at::randn({tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); int64_t axis_extent = ceilDiv(ceilDiv(tensor_inner_dim, bulk_inner_dim), unroll_dim); if (axis_extent < number_of_stages) { - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); return; } - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1206,10 +1206,10 @@ TEST_P(TmaCircularBufferingTest, MultiDim) { at::Tensor t0 = at::ones({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t1 = at::exp(t0); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1268,10 +1268,10 @@ TEST_P(TmaCircularBufferingTest, Pointwise) { at::Tensor t1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t2 = t0 + t1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = 
ke.run({t0, t1}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t2); testValidate(fusion.get(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); } @@ -1335,10 +1335,10 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) { at::Tensor t1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t2 = t0 + t1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare(tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), t2); testValidate(fusion.get(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); } @@ -1393,10 +1393,10 @@ TEST_P(TmaCircularBufferingTest, Reduction) { at::Tensor t0 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor t1 = sum(t0, {-1}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); - std::vector cg_outputs = fe.runFusion({t0}); + std::vector cg_outputs = ke.run({t0}); compare(tensor_outer_dim, cg_outputs.front(), t1); testValidate(fusion.get(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__); } @@ -1518,10 +1518,10 @@ TEST_P(TmaCircularBufferingTest, Persistent) { at::Tensor at_tv0 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); at::Tensor at_tv1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); - // Compile with FusionExecutor directly to avoid scheduling - FusionExecutor fe; - fe.compileFusion(fusion.get(), {at_tv0}); - std::vector cg_outputs = fe.runFusion({at_tv0}); + // Compile with KernelExecutor directly to avoid scheduling + KernelExecutor ke; + ke.compile(fusion.get(), {at_tv0}); + std::vector cg_outputs = ke.run({at_tv0}); std::tuple at_var_mean = at::var_mean(at_tv0, {-1}, correction, keepdim); @@ -1640,10 +1640,10 @@ TEST_P(TmaCircularBufferingTest, Matmul) { at::Tensor aten_output = (t0.unsqueeze(/*dim=*/-1) * t1.unsqueeze(/*dim=*/0)).sum(/*dim=*/1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare( tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), aten_output); testValidate( @@ -1754,10 +1754,10 @@ TEST_P(TmaCircularBufferingTest, MatmulWithBroadcastedInput) { at::Tensor t1 = at::randn({1, K, tensor_inner_dim}, options); at::Tensor aten_output = (t0 * t1).sum(/*dim=*/1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}); - std::vector cg_outputs = fe.runFusion({t0, t1}); + std::vector cg_outputs = ke.run({t0, t1}); compare( tensor_outer_dim, tensor_inner_dim, cg_outputs.front(), aten_output); testValidate( diff --git a/tests/cpp/test_combined_inner_outer_reduction.cpp b/tests/cpp/test_combined_inner_outer_reduction.cpp index 95eaadd4ad7..f0a90168cc4 100644 --- a/tests/cpp/test_combined_inner_outer_reduction.cpp +++ b/tests/cpp/test_combined_inner_outer_reduction.cpp @@ -104,10 +104,10 @@ TEST_P(CombinedSchedulerTest, LayerNormBackward) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = 
executor_cache.runFusionWithInputs(aten_inputs); auto aten_gradients = at::native_layer_norm_backward( aten_grad_out, @@ -120,7 +120,7 @@ TEST_P(CombinedSchedulerTest, LayerNormBackward) { {true, true, true}); testValidate( - fec.fusion(), + executor_cache.fusion(), {cg_outputs[0], cg_outputs[1], cg_outputs[2]}, aten_inputs, {std::get<0>(aten_gradients), @@ -261,7 +261,7 @@ TEST_F(CombinedSchedulerTest, SharedConsumer) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, @@ -269,7 +269,7 @@ TEST_F(CombinedSchedulerTest, SharedConsumer) { aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto aten_gradients = at::native_layer_norm_backward( aten_grad_out.to(at::kDouble), @@ -287,7 +287,8 @@ TEST_F(CombinedSchedulerTest, SharedConsumer) { if (!link_inner_outer) { aten_out_linked = aten_out_linked.mul(0.5); } - bool is_segmented = fec.getMostRecentKernelRuntime()->isSegmented(); + bool is_segmented = + executor_cache.getMostRecentKernelRuntime()->isSegmented(); NVF_CHECK(is_segmented, "Fusion is not segmented"); testValidate( @@ -443,7 +444,7 @@ TEST_F(CombinedSchedulerTest, SharedProducer) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, @@ -451,9 +452,9 @@ TEST_F(CombinedSchedulerTest, SharedProducer) { aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); switch (case_id) { case 0: case 1: @@ -634,9 +635,9 @@ TEST_F(CombinedSchedulerTest, CombinedReduction) { at::Tensor qv_cg_output = at::empty({dim1}, options); auto qv_aten_output = tv_input.to(at::kFloat).sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); - fe.runFusion( + KernelExecutor ke; + ke.compile(&fusion, {tv_input}, launch_constraints, compile_params); + ke.run( {tv_input}, {tv_cg_output, qv_cg_output}, launch_constraints, @@ -811,9 +812,9 @@ TEST_F(CombinedSchedulerTest, CombinedReductionMultiPerBlock) { at::Tensor qv_cg_output = at::empty({dim1}, options); at::Tensor tv_input2 = at::ones({dim0, dim1}, options); auto qv_aten_output = tv_input2.to(at::kFloat).sum({0}); - FusionExecutor fe; - fe.compileFusion(&fusion, {tv_input}, launch_constraints, compile_params); - fe.runFusion( + KernelExecutor ke; + ke.compile(&fusion, {tv_input}, launch_constraints, compile_params); + ke.run( {tv_input}, {tv_cg_output, qv_cg_output}, launch_constraints, @@ -850,10 +851,11 @@ TEST_F(CombinedSchedulerTest, InnerOuterMismatch) { at::Tensor t0 = at::randn({x, y, z}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - bool is_segmented = fec.getMostRecentKernelRuntime()->isSegmented(); + bool 
is_segmented = + executor_cache.getMostRecentKernelRuntime()->isSegmented(); if (outer_reduction_axis.size() == 2) { NVF_ERROR(!is_segmented, "Fusion should NOT be segmented!"); } else { @@ -980,8 +982,8 @@ TEST_F(CombinedSchedulerTest, SharedMemoryPersistentVectFactor) { heuristic_params->as()->smem_persistent_buffers = std::vector{tv1}; scheduler->schedule(&fusion, heuristic_params.get()); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); for (auto tv : fusion.allTvs()) { if (tv->getMemoryType() == MemoryType::Shared) { @@ -990,8 +992,8 @@ TEST_F(CombinedSchedulerTest, SharedMemoryPersistentVectFactor) { } } } - auto cg_outputs = fe.runFusion( - aten_inputs, heuristic_params->as()->lparams); + auto cg_outputs = + ke.run(aten_inputs, heuristic_params->as()->lparams); testValidate(&fusion_copy, cg_outputs, aten_inputs, __LINE__, __FILE__); } diff --git a/tests/cpp/test_dynamic_transform.cpp b/tests/cpp/test_dynamic_transform.cpp index ca29bf0825b..8eb468999b7 100644 --- a/tests/cpp/test_dynamic_transform.cpp +++ b/tests/cpp/test_dynamic_transform.cpp @@ -209,10 +209,10 @@ TEST_F(NVFuserTest, DynamicTransform3_CUDA) { at::Tensor t1 = at::randn(shape_after, options); std::vector inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), cg_outputs, inputs, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs, inputs, __LINE__, __FILE__); } // Test multiple patterns of reshape @@ -777,13 +777,13 @@ void reductionDynamicViewAddFusion( : add(x_reshape, bias); fusion.addOutput(y); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); - size_t num_concretizations = fusion_executor_cache.countConcretizations(); + size_t num_concretizations = executor_cache.countConcretizations(); // Check that concretizations and runtimes are cache misses only when they // should be auto checkCache = [&](bool expect_miss) { - auto current = fusion_executor_cache.countConcretizations(); + auto current = executor_cache.countConcretizations(); ASSERT_EQ(current, num_concretizations + (size_t)expect_miss); num_concretizations = current; }; @@ -830,7 +830,7 @@ void reductionDynamicViewAddFusion( aten_inputs.emplace_back(output_shape[i]); } - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); checkCache(expect_miss); auto at_tv1 = (reshape_before_reduction) ? (at_x + at_bias) @@ -902,22 +902,22 @@ void reductionDynamicPadAddFusion( auto y = sum(x_pad, {kReductionAxis}); fusion.addOutput(y); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); // Check that concretizations and runtimes are cache misses only when they // should be - size_t num_concretizations = fusion_executor_cache.getKernelRuntimes().size(); -#define CHECK_CACHE(expect_miss, ...) \ - auto current = fusion_executor_cache.getKernelRuntimes().size(); \ - auto expected = num_concretizations + (size_t)expect_miss; \ - NVF_CHECK( \ - current == expected, \ - "Expected cache size ", \ - expected, \ - " but found ", \ - current, \ - ". 
", \ - __VA_ARGS__); \ + size_t num_concretizations = executor_cache.getKernelRuntimes().size(); +#define CHECK_CACHE(expect_miss, ...) \ + auto current = executor_cache.getKernelRuntimes().size(); \ + auto expected = num_concretizations + (size_t)expect_miss; \ + NVF_CHECK( \ + current == expected, \ + "Expected cache size ", \ + expected, \ + " but found ", \ + current, \ + ". ", \ + __VA_ARGS__); \ num_concretizations = current; for (auto& inv : invocations) { @@ -943,7 +943,7 @@ void reductionDynamicPadAddFusion( aten_inputs.emplace_back(pad_widths[i]); } - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); CHECK_CACHE( expect_miss, "Input shape=", input_shape, " pad_widths=", pad_widths); @@ -1011,11 +1011,11 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { // concretized to Iteration, it does not wind up overwriting the Broadcast // logical. - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({5}, options); std::vector aten_inputs = {at0}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1037,13 +1037,13 @@ TEST_F(NVFuserTest, FusionDynamicEmptyCat1_CUDA) { fusion.addOutput(tv3); // Check correctness - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({5}, options); at::Tensor at1 = at::randn({0}, options); at::Tensor at2 = at::randn({3}, options); std::vector aten_inputs = {at0, at1, at2}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1063,16 +1063,16 @@ TEST_F(NVFuserTest, FusionDynamicEmptyCat2_CUDA) { fusion.addOutput(tv2); // Check correctness - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({5}, options); at::Tensor at1 = at::randn({0}, options); std::vector aten_inputs = {at0, at1}; - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); // Check that fusion consists only of tv2 = set(tv0) - auto fkr = fusion_executor_cache.getMostRecentKernelRuntime(); + auto fkr = executor_cache.getMostRecentKernelRuntime(); auto seg_fusion = fkr->fusionSegments(); auto output_def = seg_fusion->outputs()[0]->definition(); EXPECT_TRUE(output_def->isA()); @@ -1098,15 +1098,15 @@ TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) { fusion->addOutput(vm.mean); fusion->addOutput(vm.var); - FusionExecutorCache fusion_executor_cache(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at0 = at::randn({256, 128, 28, 28}, options); std::vector aten_inputs = {at0, 32}; - auto outputs = 
fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate( - fusion_executor_cache.fusion(), outputs, aten_inputs, __LINE__, __FILE__); + executor_cache.fusion(), outputs, aten_inputs, __LINE__, __FILE__); } TEST_F(NVFuserTest, Issue249_CUDA) { @@ -1126,15 +1126,14 @@ TEST_F(NVFuserTest, Issue249_CUDA) { auto tv3 = add(tv2, tv2); fusion.addOutput(tv3); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({2, 3, 4, 5}, options); - auto outputs = fusion_executor_cache.runFusionWithInputs({at_x}); + auto outputs = executor_cache.runFusionWithInputs({at_x}); - testValidate( - fusion_executor_cache.fusion(), outputs, {at_x}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {at_x}, __LINE__, __FILE__); } // This is just like the test above, but uses an input scalar with value -1 @@ -1158,7 +1157,7 @@ TEST_F(NVFuserTest, Issue249InputNegative1_CUDA) { auto tv3 = add(tv2, tv2); fusion.addOutput(tv3); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({2, 3, 4, 5}, options); @@ -1166,18 +1165,13 @@ TEST_F(NVFuserTest, Issue249InputNegative1_CUDA) { // Dynamic reshape sizes that are not constant at definition must be explicit: // no -1 allowed EXPECT_THROW( - fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, -1}), - std::exception); + executor_cache.runFusionWithInputs({at_x, 2, 4, -1}), std::exception); // Passing explicit sizes works fine - auto outputs = fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, 15}); + auto outputs = executor_cache.runFusionWithInputs({at_x, 2, 4, 15}); testValidate( - fusion_executor_cache.fusion(), - outputs, - {at_x, 2, 4, 15}, - __LINE__, - __FILE__); + executor_cache.fusion(), outputs, {at_x, 2, 4, 15}, __LINE__, __FILE__); } // Test that OptOutMutator mutates expressions in a predictable way @@ -1215,10 +1209,10 @@ TEST_F(NVFuserTest, OptOutMutatorMutatedOutput) { inlineMost(); - FusionExecutor fe; - fe.compileFusion(fusion); + KernelExecutor ke; + ke.compile(fusion); - auto outputs = fe.runFusion({t0}); + auto outputs = ke.run({t0}); testValidate(fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -1252,10 +1246,10 @@ TEST_F(NVFuserTest, OptOutMutatorRedefinedConstant) { inlineMost(); - FusionExecutor fe; - fe.compileFusion(fusion); + KernelExecutor ke; + ke.compile(fusion); - auto outputs = fe.runFusion({3L}); + auto outputs = ke.run({3L}); testValidate(fusion, outputs, {3L}, __LINE__, __FILE__); } @@ -1281,7 +1275,7 @@ TEST_F(NVFuserTest, SymbolicSqueeze) { tv1, std::vector<bool>({false, true})); // Squeeze second dimension fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3, 2}, options); @@ -1289,14 +1283,14 @@ TEST_F(NVFuserTest, SymbolicSqueeze) { // An invalid input has a second dimension that cannot be squeezed std::vector<c10::IValue> invalid_inputs = {t0, 2, 3}; - auto outputs = fec.runFusionWithInputs(valid_inputs); + auto outputs = executor_cache.runFusionWithInputs(valid_inputs); testValidate(fusion, outputs, valid_inputs,
__LINE__, __FILE__); // An informative error message should be given by // SqueezeOp::checkConcretization EXPECT_THAT( - [&]() { fec.runFusionWithInputs(invalid_inputs); }, + [&]() { executor_cache.runFusionWithInputs(invalid_inputs); }, ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr( " must concretize to IterType::Broadcast but found"))); } @@ -1325,7 +1319,7 @@ TEST_F(NVFuserTest, SymbolicExpand) { fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3, 2}, options); @@ -1333,13 +1327,14 @@ TEST_F(NVFuserTest, SymbolicExpand) { // An invalid input has a second dimension that cannot be expanded std::vector<c10::IValue> invalid_inputs = {t0, 2, 3, 2, 5}; - auto outputs = fec.runFusionWithInputs(valid_inputs); + auto outputs = executor_cache.runFusionWithInputs(valid_inputs); - testValidate(fec.fusion(), outputs, valid_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), outputs, valid_inputs, __LINE__, __FILE__); // An informative error message should be given during concretization EXPECT_THAT( - [&]() { fec.runFusionWithInputs(invalid_inputs); }, + [&]() { executor_cache.runFusionWithInputs(invalid_inputs); }, ::testing::ThrowsMessage<nvfuser::nvfError>( ::testing::HasSubstr("Mismatch in sizes when concretizing expand."))); } @@ -1380,13 +1375,13 @@ TEST_F(NVFuserTest, ConcretizeConstantExtents) { fusion->addOutput(tv5); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4096, 12288}, options); std::vector<c10::IValue> inputs = {t0}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, outputs, inputs, __LINE__, __FILE__); } @@ -1417,13 +1412,13 @@ TEST_F(NVFuserTest, DynamicSqueezeTrivialReduction) { auto tv2 = sum(tv1, {0, 2, 3, 4}); fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 9}, options); std::vector<c10::IValue> inputs = {t0}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, outputs, inputs, __LINE__, __FILE__); } @@ -1455,13 +1450,13 @@ TEST_F(NVFuserTest, DynamicSqueezeTrivialWelford) { fusion->addOutput(res.mean); fusion->addOutput(res.var); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 2, 9}, options); std::vector<c10::IValue> inputs = {t0}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, outputs, inputs, __LINE__, __FILE__); } diff --git a/tests/cpp/test_external_src.cpp b/tests/cpp/test_external_src.cpp index f0623ade609..21d487f17b4 100644 --- a/tests/cpp/test_external_src.cpp +++ b/tests/cpp/test_external_src.cpp @@ -28,7 +28,7 @@ class ExternalSrcExample : public NVFuserTest {}; TEST_F(ExternalSrcExample, Reduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - FusionExecutor fe; + KernelExecutor ke; // By default, this env var should not be defined.
To test using an // external source file, set it to the path to the external source @@ -44,7 +44,7 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) { buffer << cuda_src.rdbuf(); std::string cuda_src_str = buffer.str(); - fe.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); + ke.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); // The following is a sample launch pattern of the compiled // kernel. It must be adapted for each particular source file. @@ -82,7 +82,7 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) { clearL2Cache(); std::cout << "Launching the kernel" << std::endl; float elapsed_time_ms = - fe.runRtc(lp, {t0, t7, t14, t15, t16, t17}, PrimDataType::Int32); + ke.runRtc(lp, {t0, t7, t14, t15, t16, t17}, PrimDataType::Int32); std::cout << "kernel run in " << elapsed_time_ms << " ms, achieved " << (read_write_bytes / elapsed_time_ms / 1000.0 / 1000.0) << " GB/s" << std::endl; @@ -99,7 +99,7 @@ TEST_F(ExternalSrcExample, Reduction_CUDA) { TEST_F(ExternalSrcExample, Matmul_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - FusionExecutor fe; + KernelExecutor ke; // By default, this env var should not be defined. To test using an // external source file, set it to the path to the external source @@ -115,7 +115,7 @@ TEST_F(ExternalSrcExample, Matmul_CUDA) { buffer << cuda_src.rdbuf(); std::string cuda_src_str = buffer.str(); - fe.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); + ke.compileRtc(cuda_src_str, "kernel1", true, PrimDataType::Int32); int M = 2048, N = 3456, K = 2048; MmaLayout layout = MmaLayout::TN; @@ -129,7 +129,7 @@ TEST_F(ExternalSrcExample, Matmul_CUDA) { auto output = at::zeros_like(at_output); clearL2Cache(); std::cout << "Launching the kernel" << std::endl; - float elapsed_time_ms = fe.runRtc( + float elapsed_time_ms = ke.runRtc( lp, {inputs.first, inputs.second, output}, PrimDataType::Int32); std::cout << "kernel run in " << elapsed_time_ms << " ms." 
<< std::endl; diff --git a/tests/cpp/test_gpu1.cpp b/tests/cpp/test_gpu1.cpp index f5ee88d0936..4ed46cddc46 100644 --- a/tests/cpp/test_gpu1.cpp +++ b/tests/cpp/test_gpu1.cpp @@ -207,9 +207,9 @@ TEST_F(NVFuserTest, FusionClear_CUDA) { at::Tensor input1 = at::randn({16, 8, 8}, options); at::Tensor input2 = at::randn_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -813,9 +813,9 @@ TEST_F(NVFuserTest, FusionOuterSplit_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({}); const auto& output = outputs.at(0); at::Tensor output_ref = at::ones_like(output, options); @@ -855,9 +855,9 @@ TEST_F(NVFuserTest, FusionCodeGen_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({}); const auto& output = outputs.at(0); at::Tensor output_ref = at::ones_like(output, options); @@ -899,9 +899,9 @@ TEST_F(NVFuserTest, FusionCodeGen2_CUDA) { at::Tensor input1 = at::randn({16, 8, 8}, options); at::Tensor input2 = at::randn_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -955,9 +955,9 @@ TEST_F(NVFuserTest, FusionSimplePWise_CUDA) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -1013,9 +1013,9 @@ TEST_F(NVFuserTest, FusionSimplePWiseDtypeComplex_CUDA) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + static_cast>(scalar1); at::Tensor output_ref = input1 + tv2_ref; @@ -1063,9 +1063,9 @@ TEST_F(NVFuserTest, FusionExecKernel_CUDA) { at::Tensor input1 = at::ones({1, 128}, options); at::Tensor input2 = at::ones_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); at::Tensor check = at::full({1, 128}, 4, options); ; @@ -1145,9 +1145,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, 
{aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1199,9 +1199,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({129, 127}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -1253,9 +1253,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { at::Tensor cg_output = at::empty_like(t0, options); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -1317,9 +1317,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1353,9 +1353,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1388,9 +1388,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1449,9 +1449,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { std::vector aten_inputs = {t0, t2, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1505,9 +1505,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { std::vector aten_inputs = {t0, t2, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1574,9 +1574,9 @@ TEST_F(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1644,9 +1644,9 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - 
FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1719,9 +1719,9 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -1800,9 +1800,9 @@ TEST_F(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1864,9 +1864,9 @@ TEST_F(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1972,9 +1972,9 @@ TEST_F(NVFuserTest, FusionScalarInputs_CUDA) { at::Scalar(fl2), at::Scalar(fl3)}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -2024,9 +2024,9 @@ TEST_F(NVFuserTest, FusionLoopUnroll_CUDA) { at::Tensor input0 = at::randn({129, 13, 3}, options); at::Tensor input1 = at::randn({129, 13, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1}); - auto outputs = fe.runFusion({input0, input1}); + KernelExecutor ke; + ke.compile(&fusion, {input0, input1}); + auto outputs = ke.run({input0, input1}); NVF_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); } @@ -2173,9 +2173,9 @@ void test_op( std::vector output_vect = {cg_output}; cudaDeviceSynchronize(); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs_ivalues); - fe.runFusion(aten_inputs_ivalues, output_vect); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs_ivalues); + ke.run(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); at::Tensor aten_output = af(aten_inputs); @@ -2710,17 +2710,17 @@ TEST_F(NVFuserTest, FusionFp8CastOps_CUDA) { // const at::ArrayRef input_ivalues(inputs); std::vector inputs = {input1}; - FusionExecutor fe; + KernelExecutor ke; if (!deviceMajorMinorCheck(9)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, inputs); }, + [&]() { ke.compile(&fusion, inputs); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: Fusion contains Float8_xxx values which was introduced in Hopper (9.0)"))); GTEST_SKIP() << "skipping tests on pre-HOPPER GPUs"; } else { - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); at::Tensor ref_output = input1.to(at_fp8_type).to(at_src_type); @@ -2790,9 +2790,9 @@ TEST_F(NVFuserTest, 
FusionReduction1_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -2862,9 +2862,9 @@ TEST_F(NVFuserTest, FusionReduction2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -2913,9 +2913,9 @@ TEST_F(NVFuserTest, FusionReduction3_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); auto aten_output = aten_input.to(at::kDouble).sum({1}); @@ -2979,9 +2979,9 @@ TEST_F(NVFuserTest, FusionReduction4_CUDA) { at::Tensor t1 = at::randn({numel_x, numel_y}, options); at::Tensor t4 = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t4}); - auto cg_outputs = fe.runFusion({t0, t1, t4}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t4}); + auto cg_outputs = ke.run({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); @@ -3033,9 +3033,9 @@ TEST_F(NVFuserTest, FusionReduction5_CUDA) { at::Tensor cg_output = at::empty({bidy, tidx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -3098,9 +3098,9 @@ TEST_F(NVFuserTest, FusionReduction6_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1, 2}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -3130,9 +3130,9 @@ TEST_F(NVFuserTest, FusionMultiGridReduction_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -3154,9 +3154,9 @@ TEST_F(NVFuserTest, FusionMultiGridReduction2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({4, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_output = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_output = ke.run({input}); 
testValidate(&fusion, cg_output, {input}, __LINE__, __FILE__); } @@ -3207,9 +3207,9 @@ TEST_F(NVFuserTest, FusionReductionTFT_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -3271,9 +3271,9 @@ TEST_F(NVFuserTest, FusionReductionOuterSplit_CUDA) { at::Tensor t1 = at::randn({numel_x, numel_y}, options); at::Tensor t4 = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t4}); - auto cg_outputs = fe.runFusion({t0, t1, t4}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t4}); + auto cg_outputs = ke.run({t0, t1, t4}); auto t2 = t0.add(t1); auto t3 = t2.to(at::kDouble).sum({1}); @@ -3310,7 +3310,7 @@ TEST_F(NVFuserTest, FusionBranches_CUDA) { at::Tensor t1 = at::randn({x, y}, options); at::Tensor t2 = at::randn({x, y}, options); - FusionExecutor fe; + KernelExecutor ke; tv6->merge(0); tv6->split(0, 128); tv6->split(0, 4); @@ -3331,8 +3331,8 @@ TEST_F(NVFuserTest, FusionBranches_CUDA) { std::vector aten_inputs = {t0, t1, t2}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3377,9 +3377,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast1_CUDA) { std::vector aten_inputs = {t0, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3429,9 +3429,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast2_CUDA) { std::vector aten_inputs = {t0, t1, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3471,9 +3471,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast3_CUDA) { std::vector aten_inputs = {t0, t2}; at::Tensor cg_output = at::empty({x, y, z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3516,9 +3516,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast4_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3556,9 +3556,9 @@ TEST_F(NVFuserTest, FusionSimpleBCast5_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -3608,9 +3608,9 @@ TEST_F(NVFuserTest, FusionComplexBCast1_CUDA) { 
std::vector aten_inputs = {t0, t3, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3652,9 +3652,9 @@ TEST_F(NVFuserTest, FusionComplexBCast2_CUDA) { at::Tensor t0 = at::randn({y, z}, options); at::Tensor t4 = at::randn({x, y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t4}); - auto cg_outputs = fe.runFusion({t0, t4}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t4}); + auto cg_outputs = ke.run({t0, t4}); testValidate(&fusion, {cg_outputs}, {t0, t4}, __LINE__, __FILE__); } @@ -3726,18 +3726,18 @@ TEST_F(NVFuserTest, FusionSimpleGemm_CUDA) { at::Tensor t0 = at::randn({M, K}, options); at::Tensor t1 = at::randn({K, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + ke.run({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization validation in. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); + // ASSERT_ANY_THROW(ke.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble)); @@ -3791,9 +3791,9 @@ TEST_F(NVFuserTest, FusionSoftmax1D_CUDA) { at::Tensor cg_output = at::empty({dimx}, options); at::Tensor t3_output = at::empty_like(cg_output, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - fe.runFusion({t0}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + ke.run({t0}, {cg_output}); auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); @@ -3860,9 +3860,9 @@ TEST_F(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { at::Tensor input = at::randn({dimx}, options); at::Tensor t3_output = at::empty({dimx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -3920,9 +3920,9 @@ TEST_F(NVFuserTest, FusionSoftmax3D_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -3995,9 +3995,9 @@ TEST_F(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { at::Tensor input = at::randn({dimx, dimy, dimz}, options); at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -4081,9 +4081,9 @@ TEST_F(NVFuserTest, FusionGridReduction1_CUDA) { at::Tensor input = 
at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4141,9 +4141,9 @@ TEST_F(NVFuserTest, FusionGridReduction2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4203,9 +4203,9 @@ TEST_F(NVFuserTest, FusionGridReduction3dim1_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -4262,9 +4262,9 @@ TEST_F(NVFuserTest, FusionGridReduction3dim0_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -4328,9 +4328,9 @@ TEST_F(NVFuserTest, FusionGridReduction4_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -4385,9 +4385,9 @@ TEST_F(NVFuserTest, FusionGridReduction5_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); @@ -4450,9 +4450,9 @@ TEST_F(NVFuserTest, FusionGridReduction6_CUDA) { at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1, 2}); @@ -4482,9 +4482,9 @@ TEST_F(NVFuserTest, FusionGridReduction7_CUDA) { at::Tensor input = at::randn({numel_x}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -4508,9 +4508,9 @@ TEST_F(NVFuserTest, FusionGridReduction8_CUDA) { auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -4545,9 +4545,9 @@ TEST_F(NVFuserTest, FusionGridReduction9_CUDA) { std::vector aten_inputs = {t0, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_output = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_output = ke.run(aten_inputs); testValidate(&fusion, cg_output, {t0, t2}, __LINE__, __FILE__); } @@ -4586,9 +4586,9 @@ TEST_F(NVFuserTest, FusionGridReduction10_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_output = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_output = ke.run({t0}); testValidate(&fusion, cg_output, {t0}, __LINE__, __FILE__); } @@ -4616,9 +4616,9 @@ TEST_F(NVFuserTest, FusionNonRedAxisBind_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({16, bid_x * tid_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -4666,9 +4666,9 @@ TEST_F(NVFuserTest, FusionSplitBCast_CUDA) { at::Tensor t1 = at::randn({32, 32, 128}, options); at::Tensor cg_output = at::empty({32, 32, 128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - fe.runFusion({t0, t1}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + ke.run({t0, t1}, {cg_output}); } TEST_F(NVFuserTest, FusionBCastInnerDim_CUDA) { @@ -4747,9 +4747,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4778,9 +4778,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -4808,9 +4808,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({dimx, dimy}, options); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4831,9 +4831,9 @@ TEST_F(NVFuserTest, FusionZeroDimComputeAt_CUDA) { auto 
options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -4866,9 +4866,9 @@ TEST_F(NVFuserTest, FusionZeroDimBroadcast_CUDA) { std::vector aten_inputs = {t0, t1}; at::Tensor cg_output = at::empty({}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, {cg_output}); testValidate(&fusion, {cg_output}, aten_inputs, __LINE__, __FILE__); } @@ -4901,9 +4901,9 @@ TEST_F(NVFuserTest, FusionZeroDimReduction_CUDA) { at::Tensor cg_output = at::empty({}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate( &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); @@ -4953,9 +4953,9 @@ TEST_F(NVFuserTest, FusionBCastAfterReduce_CUDA) { auto aten_output = t3.add(t4); std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t4}); - auto cg_outputs = fe.runFusion({t0, t4}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t4}); + auto cg_outputs = ke.run({t0, t4}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -4977,9 +4977,9 @@ TEST_F(NVFuserTest, FusionOutputBroadcast_CUDA) { at::Tensor aten_input = at::randn({2, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5000,9 +5000,9 @@ TEST_F(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5076,9 +5076,9 @@ TEST_F(NVFuserTest, FusionSumTo_CUDA) { at::Tensor aten_input = at::randn(tensor_shape_ref, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); NVF_CHECK( cg_outputs[0].dim() == static_cast(sum_to_shape.size()), @@ -5118,9 +5118,9 @@ TEST_F(NVFuserTest, FusionSumToNoop_CUDA) { at::Tensor aten_input = at::randn(tensor_shape_ref, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); NVF_CHECK( cg_outputs[0].dim() == static_cast(sum_to_shape.size()), @@ -5265,9 +5265,9 @@ TEST_F(NVFuserTest, FusionSymbolicReduction_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}, lparams); - auto cg_outputs = 
fe.runFusion({aten_input}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}, lparams); + auto cg_outputs = ke.run({aten_input}, lparams); testValidate( &fusion, @@ -5307,11 +5307,9 @@ TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::Reduction, {aten_input}); - FusionExecutor fusion_executor; - fusion_executor.compileFusion( - &fusion, {aten_input}, heuristic_params->lparams); - fusion_executor.runFusion( - {aten_input}, {cg_output}, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}, heuristic_params->lparams); + ke.run({aten_input}, {cg_output}, heuristic_params->lparams); testValidate( &fusion, @@ -5538,9 +5536,9 @@ TEST_F(NVFuserTest, FusionCacheBefore_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5574,9 +5572,9 @@ TEST_F(NVFuserTest, FusionCacheAfter_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5616,9 +5614,9 @@ TEST_F(NVFuserTest, FusionCacheFork_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5663,9 +5661,9 @@ TEST_F(NVFuserTest, FusionCacheIndirect_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5719,9 +5717,9 @@ TEST_F(NVFuserTest, FusionCacheBcast_CUDA) { at::Tensor t1 = at::randn({N}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5756,9 +5754,9 @@ TEST_F(NVFuserTest, FusionCacheMultiConsumer_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -5808,13 +5806,13 @@ TEST_F(NVFuserTest, FusionSmem_CUDA) { 
std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemReduce_CUDA) { @@ -5856,13 +5854,13 @@ TEST_F(NVFuserTest, FusionSmemReduce_CUDA) { at::Tensor aten_input = at::randn({M, K, N}, options); at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1}); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) { @@ -5926,14 +5924,14 @@ TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) { std::vector aten_inputs = {t0, t1}; at::Tensor aten_output = at::matmul(t0.to(at::kDouble), t1.to(at::kDouble)); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { @@ -6015,14 +6013,14 @@ TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { @@ -6087,9 +6085,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { at::Tensor aten_input = at::randn({dimx, dimy}, options); auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input, 128}); - auto cg_outputs = fe.runFusion({aten_input, 128}); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, {aten_input, 128}); + auto cg_outputs = ke.run({aten_input, 128}); testValidate( &fusion, @@ -6265,10 +6263,10 @@ TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) { auto aten_mean = std::get<1>(aten_results); auto aten_rstd = std::get<2>(aten_results); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -6321,10 +6319,10 @@ TEST_F(NVFuserTest, 
FusionMagicSchedulerRMSNormBackward_CUDA) { auto var = at::mul(sum, 1.0 / NORM_SIZE); auto aten_rstd = at::pow(at::add(var, kEps), -0.5); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { aten_grad_out, aten_input, aten_rstd, aten_weight}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto in_mul_rstd = at::mul(aten_input, aten_rstd); auto grad_out_mul = at::mul(aten_grad_out, in_mul_rstd); @@ -6383,9 +6381,9 @@ TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { // tv11 and tv17 should not be predicated. See issue #496 ASSERT_FALSE(PredicatedChecker::isPredicated( - 11, cg_results.fusion_executor->kernel())); + 11, cg_results.kernel_executor->kernel())); ASSERT_FALSE(PredicatedChecker::isPredicated( - 17, cg_results.fusion_executor->kernel())); + 17, cg_results.kernel_executor->kernel())); } TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormalization_CUDA) { @@ -6842,10 +6840,9 @@ TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalShared_CUDA) { at::Tensor aten_dynamic_out = aten_output.narrow(1, static_size, dimy - static_size); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_static_in, aten_dynamic_in}); - fe.runFusion( - {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, {aten_static_in, aten_dynamic_in}); + ke.run({aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); testValidate( &fusion, @@ -7031,10 +7028,10 @@ TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { std::vector aten_inputs = { aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); + ke.run(aten_inputs, {cg_static_out, cg_dynamic_out}); auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1); @@ -7155,9 +7152,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { std::vector aten_inputs = { aten_input, kGamma, kBeta, kEps, dimy, TIDX}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -7201,9 +7198,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}, lparams); - auto cg_outputs = fe.runFusion({aten_input}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}, lparams); + auto cg_outputs = ke.run({aten_input}, lparams); testValidate( &fusion, @@ -7214,7 +7211,7 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { __FILE__, "", lparams); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { @@ -7264,9 +7261,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - 
fe.compileFusion(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); - auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); + auto cg_outputs = ke.run({aten_input, runtime_threadIdx_dim}, lparams); testValidate( &fusion, @@ -7278,7 +7275,7 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { "", lparams); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 0); } TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { @@ -7328,14 +7325,14 @@ TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { LaunchParams lparams(-1, -1, -1, BSX, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); testValidate( &fusion, cg_outputs, aten_inputs, __LINE__, __FILE__, "", lparams); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 1); } TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { @@ -7453,15 +7450,15 @@ TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - FusionExecutor fe; + KernelExecutor ke; // Generate CUDA and compile with nvRTC - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); - NVF_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); + NVF_CHECK(ke.kernel()->summary().war_hazard_syncs_count == 1); } } // namespace nvfuser diff --git a/tests/cpp/test_gpu2.cpp b/tests/cpp/test_gpu2.cpp index 5dae411ee12..2426d801176 100644 --- a/tests/cpp/test_gpu2.cpp +++ b/tests/cpp/test_gpu2.cpp @@ -94,9 +94,9 @@ TEST_F(NVFuserTest, FusionGlobalIntermediate_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, lparams); - auto cg_outputs = fe.runFusion({input}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {input}, lparams); + auto cg_outputs = ke.run({input}, lparams); auto aten_output = input.to(at::kDouble).sum({1}); testValidate( @@ -141,9 +141,9 @@ TEST_F(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t2, t3}); - auto cg_outputs = fe.runFusion({t0, t1, t2, t3}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t2, t3}); + auto cg_outputs = ke.run({t0, t1, t2, t3}); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -199,9 +199,9 @@ TEST_F(NVFuserTest, FusionUnrollWithAlloc_CUDA) { tv1->computeAt(tv2_rf, -1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto aten_output = (input + 0).to(at::kDouble).sum(1); @@ -276,9 +276,9 @@ TEST_F(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { auto t3 = t1 + 3; auto t4 = t3 + 4; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + 
KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -310,9 +310,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -347,9 +347,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder2_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -399,9 +399,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder3_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -443,9 +443,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder4_CUDA) { at::empty_like(t0, options), at::empty_like(t0, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + ke.run(aten_inputs, cg_outputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -476,9 +476,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder5_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); auto t1 = aten_input + 1; auto t2 = t1 + 2; @@ -518,9 +518,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder6_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -558,9 +558,9 @@ TEST_F(NVFuserTest, FusionTraversalOrder7_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -619,9 +619,9 @@ TEST_F(NVFuserTest, FusionThreadPredicate_CUDA) { std::vector cg_outputs = { at::empty_like(aten_input, options), at::empty({numel_x}, options)}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, cg_outputs); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); @@ -739,10 +739,10 @@ TEST_F(NVFuserTest, FusionReduceSingle_CUDA) { at::Tensor aten_input = 
at::randn({100, 1}, options); // Grab only tensor views, though there shouldn't be any other type - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); // no broadcasting needed, omitting the last optional argument; - auto cg_outputs = fe.runFusion({aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -871,9 +871,9 @@ TEST_F(NVFuserTest, FusionTrivialReduction_CUDA) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({10, 20, 1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1281,9 +1281,9 @@ TEST_F(NVFuserTest, FusionIssue459_CUDA) { std::vector aten_inputs = {t0, t1}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1311,9 +1311,9 @@ TEST_F(NVFuserTest, FusionSmemIndexingSimple_CUDA) { auto aten_input = at::randn({12, 34}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1422,9 +1422,9 @@ TEST_F(NVFuserTest, FusionSmemIndexing_CUDA) { // A, B, m_tile_dim, split_k, intra_cta_tile std::vector aten_inputs = {t0, t1, 3, 4, 5}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -1457,9 +1457,9 @@ TEST_F(NVFuserTest, FusionCacheBeforeReduction_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - fe.runFusion({aten_input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + ke.run({aten_input}, {cg_output}); testValidate(&fusion, {cg_output}, {aten_input}, __LINE__, __FILE__); } @@ -1494,9 +1494,9 @@ TEST_F(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1600,9 +1600,9 @@ TEST_F(NVFuserTest, FusionIssue367_CUDA) { at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); @@ -1626,9 +1626,9 
@@ TEST_F(NVFuserTest, FusionIssue468_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({10, 100}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1678,9 +1678,9 @@ TEST_F(NVFuserTest, FusionIssue363_CUDA) { std::vector aten_inputs = {t0, t1}; - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1704,9 +1704,9 @@ TEST_F(NVFuserTest, FusionIssue484_CUDA) { at::Tensor aten_input = at::randn({M, M}, options); - nvfuser::FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + nvfuser::KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1730,9 +1730,9 @@ TEST_F(NVFuserTest, FusionIssue329_CUDA) { std::vector t0_shape{17, 19}; auto aten_input = at::randn(t0_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1771,9 +1771,9 @@ TEST_F(NVFuserTest, FusionIssue382_CUDA) { std::vector aten_inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1800,9 +1800,9 @@ TEST_F(NVFuserTest, FusionIssue507_CUDA) { std::vector t0_shape{17, 19}; auto aten_input = at::randn(t0_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate(&fusion, cg_outputs, {aten_input}, __LINE__, __FILE__); } @@ -1838,9 +1838,9 @@ TEST_F(NVFuserTest, FusionIssue532_CUDA) { at::Tensor t0 = at::randn({M}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1867,9 +1867,9 @@ TEST_F(NVFuserTest, FusionLoopUnswitch_CUDA) { at::Tensor t0 = at::randn({M}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1945,17 +1945,17 @@ TEST_F(NVFuserTest, FusionIssue549_CUDA) { // Lets specify a few bounds in launch params to make sure it works LaunchParams lparams(1, -1, -1, 32, 4, 4); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, 
t1}, lparams); - fe.runFusion({t0, t1}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}, lparams); + ke.run({t0, t1}, lparams); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization validation in. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); + // ASSERT_ANY_THROW(ke.run({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6))); // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble)); @@ -1964,7 +1964,7 @@ TEST_F(NVFuserTest, FusionIssue549_CUDA) { } TEST_F(NVFuserTest, FusionSimpleCompileRtc_CUDA) { - FusionExecutor fe; + KernelExecutor ke; std::string kernel = R"( __global__ void kernel1(Tensor T0, Tensor T1) { if(threadIdx.x==0){ @@ -1974,7 +1974,7 @@ __global__ void kernel1(Tensor T0, Tensor T1) { } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 256, // gdimx 1, // gdimy @@ -1989,14 +1989,14 @@ __global__ void kernel1(Tensor T0, Tensor T1) { const std::vector tensor_dims = {8}; auto in0 = at::randn(tensor_dims, options); auto out0 = at::empty_like(in0); - fe.runRtc(lp, {in0, out0}, PrimDataType::Int); + ke.runRtc(lp, {in0, out0}, PrimDataType::Int); auto out_ref = in0 * 2; NVF_CHECK(out_ref.allclose(out0)); } TEST_F(NVFuserTest, FusionSerialWelford_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 128, y = 64, z = 64; std::string kernel = R"( @@ -2030,7 +2030,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 1, // gdimx 1, // gdimy @@ -2046,14 +2046,14 @@ __global__ void kernel1( auto in0 = at::randn(tensor_dims, options); auto out_var = at::empty({x}, options); auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_var, out_avg}, PrimDataType::Int); + ke.runRtc(lp, {in0, out_var, out_avg}, PrimDataType::Int); NVF_CHECK(in0.var({1, 2}, false).allclose(out_var)); NVF_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } TEST_F(NVFuserTest, FusionBlockWelford_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 7, y = 8, z = 9; std::string kernel = R"( @@ -2102,7 +2102,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 1, // gdimx 1, // gdimy @@ -2129,7 +2129,7 @@ __global__ void kernel1( // run kernel auto out_var = at::zeros({x}, options); auto out_avg = at::zeros({x}, options); - fe.runRtc( + ke.runRtc( lp, {in0, out_avg, out_var, init_avg, init_var, init_N}, PrimDataType::Int); @@ -2142,7 +2142,7 @@ __global__ void kernel1( } TEST_F(NVFuserTest, FusionBlockWelfordNoInit_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 7, y = 8, z = 9; // need support IValue for integer input as initial count @@ -2183,7 +2183,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( 1, // gdimx 1, // gdimy @@ -2199,14 +2199,14 @@ __global__ void kernel1( auto in0 = at::randn(tensor_dims, options); auto out_var = at::empty({x}, options); auto out_avg = at::empty({x}, options); - fe.runRtc(lp, {in0, out_avg, out_var},
PrimDataType::Int); + ke.runRtc(lp, {in0, out_avg, out_var}, PrimDataType::Int); NVF_CHECK(in0.var({1, 2}, false).allclose(out_var)); NVF_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } TEST_F(NVFuserTest, FusionGridWelfordNoInit_CUDA) { - FusionExecutor fe; + KernelExecutor ke; int x = 128, y = 64, z = 128; std::string kernel = R"( @@ -2258,7 +2258,7 @@ __global__ void kernel1( } } )"; - fe.compileRtc(kernel, "kernel1", false, PrimDataType::Int); + ke.compileRtc(kernel, "kernel1", false, PrimDataType::Int); LaunchParams lp( x, // gdimx y, // gdimy @@ -2282,7 +2282,7 @@ __global__ void kernel1( auto work_buf_var = at::empty({x * y * z}, options); auto work_buf_N = at::empty({x * y * z}, options_int); auto sync_flag = at::zeros({1}, options_int); - fe.runRtc( + ke.runRtc( lp, {in0, out_avg, @@ -2325,15 +2325,15 @@ TEST_F(NVFuserTest, FusionWelfordOp_CUDA) { auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2370,15 +2370,15 @@ TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) { at::Tensor t_avg = at::empty({M}, options); at::Tensor t_N = at::empty({M}, options_int); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2415,15 +2415,15 @@ TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) { at::Tensor t_var = at::empty({M}, options); at::Tensor t_N = at::empty({M}, options_int); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2459,15 +2459,15 @@ TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) { at::Tensor t_var = at::empty({M}, options); at::Tensor t_N = at::empty({M}, options_int); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; testValidate( - fe.kernel(), + ke.kernel(), outputs, {t0}, {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N}, @@ -2588,17 +2588,17 @@ TEST_P(WelfordReduction, Test) { auto lparams = reduction_params->lparams; auto cparams = reduction_params->cparams; - FusionExecutor fe; + KernelExecutor ke; // Needs to pass compile para to use the correct index type, otherwise the // lowering pass will use int64 as the index tpye, since this test saves // `tv_N` as index type, it may cause vectorization size validation error. 
For // example, the heuristics set index type to int32 and the max vectorization - // factor is 4, if compile para is not passed to compileFusion, the lowering + // factor is 4, if compile params are not passed to compile, the lowering // pass uses int64 as index type, so the max vectorization factor is 16 bytes / // sizeof(int64) = 2, which is wrong since the actual index type is int32 // and the max vectorization factor is 4. - fe.compileFusion(&fusion, {aten_input}, lparams, cparams); - auto outputs = fe.runFusion({aten_input}, lparams); + ke.compile(&fusion, {aten_input}, lparams, cparams); + auto outputs = ke.run({aten_input}, lparams); // by default Welford outputs sum of square diff so need to divide to // get var @@ -2613,7 +2613,7 @@ TEST_P(WelfordReduction, Test) { at_n = at_n.sum({axis}); testValidate( - fe.kernel(), + ke.kernel(), outputs, {aten_input}, {at_avg, at_var, at_n}, @@ -2755,12 +2755,12 @@ TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { // Lets specify a few bounds in launch params to make sure it works LaunchParams lparams(1, -1, -1, 32, 4, 4); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, lparams); - fe.runFusion({t0, t1}, lparams); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}, lparams); + ke.run({t0, t1}, lparams); // Don't specify any launch params - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble)); @@ -2820,9 +2820,9 @@ TEST_F(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_input_t = at::transpose(input, 1, 2); auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false); @@ -2894,9 +2894,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { at::Tensor aten_input = at::randn({129, 127}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); at::Tensor aten_input_t = aten_input.t(); @@ -2963,9 +2963,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({129, 127}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); auto input_t = input.t(); auto t1 = input_t.mul({-1.0}); @@ -3029,9 +3029,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2}); auto t1_t = t1.permute({3, 0, 1, 2}); @@ -3107,9 +3107,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2});
auto t1_t = t1.permute({3, 0, 1, 2}); @@ -3155,9 +3155,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t2 = t0.t().add(2.0); auto aten_output = t1.t().mul(t2); @@ -3197,9 +3197,9 @@ TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t2 = t0.t().add(2.0); auto aten_output = t1.t().mul(t2); @@ -3348,9 +3348,9 @@ TEST_F(NVFuserTest, FusionVectorizeSimple_CUDA) { at::Tensor aten_input = at::empty({2, 6, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); at::Tensor aten_output = aten_input.sin(); @@ -3423,9 +3423,9 @@ TEST_F(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -3503,9 +3503,9 @@ TEST_F(NVFuserTest, FusionGridPersistence_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -3536,9 +3536,9 @@ TEST_F(NVFuserTest, FusionGridPersistence2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); testValidate(&fusion, out, {input}, __LINE__, __FILE__); } @@ -3570,9 +3570,9 @@ TEST_F(NVFuserTest, FusionWelfordPersistence_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) .unsqueeze(-1) @@ -3610,9 +3610,9 @@ TEST_F(NVFuserTest, FusionWelfordPersistence2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto out = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto out = ke.run({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) .unsqueeze(0) @@ -3648,9 +3648,9 @@ TEST_F(NVFuserTest, FusionIssue633_CUDA) { at::Tensor t1 
= at::randn({dx, dy, 1}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3681,9 +3681,9 @@ TEST_F(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { at::Tensor t1 = at::randn(shape, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3730,9 +3730,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3786,9 +3786,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3847,9 +3847,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3908,11 +3908,11 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; + KernelExecutor ke; // TODO: throw assertion - cannot merge non-contiguous vectorization axes // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { @@ -3964,9 +3964,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( @@ -4006,10 +4006,10 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize); } - FusionExecutor fe; + KernelExecutor ke; // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { @@ -4056,9 +4056,9 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { at::randn({bx, by}, options).index({"...", at::indexing::Slice(3)}); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - 
fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4110,12 +4110,12 @@ TEST_F(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { at::randn({bx, by}, options).index({"...", at::indexing::Slice(3)}); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); // Failure because the input + output tensors do not have the same stride // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); } TEST_F(NVFuserTest, FusionVectorization1_CUDA) { @@ -4157,9 +4157,9 @@ TEST_F(NVFuserTest, FusionVectorization1_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4197,10 +4197,10 @@ TEST_F(NVFuserTest, FusionVectorization2_CUDA) { tv->axis(-2)->parallelize(ParallelType::Vectorize); } - FusionExecutor fe; + KernelExecutor ke; // Make sure compilation fails // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } // TODO: Re-enable once vectorization validation is fixed @@ -4244,20 +4244,20 @@ TEST_F(NVFuserTest, FusionVectorization3_CUDA) { at::Tensor t1 = at::randn({bx, by}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); aten_inputs[0] = t0.index({"...", at::indexing::Slice(1)}); aten_inputs[1] = t1.index({"...", at::indexing::Slice(1)}); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); + ASSERT_ANY_THROW(ke.run(aten_inputs)); t0 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)}); t1 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)}); aten_inputs = {t0, t1}; - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4309,9 +4309,9 @@ TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto aten_output = t0.add(t1).sum(1); testValidate( @@ -4372,9 +4372,9 @@ TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) { at::Tensor t2 = at::randn({z, x, y}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4406,9 +4406,9 @@ TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) { 
at::Tensor t0 = at::randn({x}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -4428,9 +4428,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) { tv2->axis(-1)->parallelize(ParallelType::TIDy); // Invalid as tv1 and tv2 do not have the same ParallelType - FusionExecutor fe; + KernelExecutor ke; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) { @@ -4450,8 +4450,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) { // tv1 and tv2 do not have the same ParallelType, but tv1 is on shared // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) { @@ -4473,8 +4473,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) { tv1->setMemoryType(MemoryType::Global); // tv1 and tv2 have the same shape and ParallelType - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) { @@ -4496,8 +4496,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) { tv1->setMemoryType(MemoryType::Global); // tv1 and tv2 do not have the same shape but global memory comm is supported. - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) { @@ -4520,8 +4520,8 @@ TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) { // tv1 and tv2 do not have the same shape, but tv1 is on shared // memory, so it is valid - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } // See issue #995 @@ -4648,9 +4648,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize8_CUDA) { at::Tensor input0 = at::arange(64, options).view({32, 2}); at::Tensor input1 = at::arange(32, options) * 0.01; - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1}); - auto outputs = fe.runFusion({input0, input1}); + KernelExecutor ke; + ke.compile(&fusion, {input0, input1}); + auto outputs = ke.run({input0, input1}); testValidate(&fusion, outputs, {input0, input1}, __LINE__, __FILE__); } @@ -4737,9 +4737,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize10_CUDA) { at::Tensor t1 = at::randn({s0, s1}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4783,9 +4783,9 @@ TEST_F(NVFuserTest, FusionValidateParallelize11_CUDA) { at::Tensor t1 = at::randn({s0, s1}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4897,9 +4897,9 @@ TEST_F(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { at::Tensor t0 = at::randn({M, N, K}, options); std::vector aten_inputs = {t0}; -
FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4926,9 +4926,9 @@ TEST_F(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { at::Tensor t0 = at::randn({M, N, K}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); at::Tensor aten_avg = t0.mean({1, 2}); at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K; testValidate( @@ -4965,9 +4965,9 @@ TEST_F(NVFuserTest, FusionReductionPredicate_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); at::Tensor cg_output = at::empty({numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - fe.runFusion({input}, {cg_output}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + ke.run({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -5062,9 +5062,9 @@ TEST_F(NVFuserTest, FusionIssue757_CUDA) { at::Tensor t3 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -5100,9 +5100,9 @@ TEST_F(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { at::Tensor t3 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -5364,10 +5364,10 @@ TEST_F(NVFuserTest, FusionBNBackwardRepro_CUDA) { at::Tensor input6 = at::randn_like(input0); at::Tensor input7 = at::randn_like(input0); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector inputs = { input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); } // TODO: We only changed inputs, merge this with the test above. 
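// All of the executor hunks in these test files apply the same mechanical
// rename. A minimal before/after sketch of the pattern (fusion setup elided;
// `inputs` and `lparams` are illustrative placeholders, not lines from this
// diff):
//
//   FusionExecutor fe;                            // old API
//   fe.compileFusion(&fusion, inputs, lparams);
//   auto outs = fe.runFusion(inputs, lparams);
//
//   KernelExecutor ke;                            // new API
//   ke.compile(&fusion, inputs, lparams);
//   auto outs = ke.run(inputs, lparams);
//
// The RTC entry points (compileRtc/runRtc), kernel(), and
// FusionExecutorCache::runFusionWithInputs keep their names; only cache
// variables such as fec and fusion_executor_cache become executor_cache.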
@@ -5432,10 +5432,10 @@ TEST_F(NVFuserTest, FusionBNBackwardRepro2_CUDA) { at::Tensor input6 = at::randn_like(input0); at::Tensor input7 = at::randn_like(input0); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector inputs = { input0, input1, input2, input3, input4, input5, input6, input7}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); } TEST_F(NVFuserTest, FusionBNRepro_CUDA) { @@ -5494,10 +5494,10 @@ TEST_F(NVFuserTest, FusionBNRepro_CUDA) { auto input4_ref = input4.clone(); auto input5_ref = input5.clone(); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = { input1, input2, input3, input4, input5}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto at_results = at::native_batch_norm( input1_ref, @@ -5563,9 +5563,9 @@ TEST_F(NVFuserTest, FusionBNRepro2_CUDA) { at::Tensor weight; at::Tensor bias; - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector aten_inputs = {input1}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5894,8 +5894,8 @@ TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) { at::Tensor t0 = at::randn({16, 16}, options); at::Tensor t1 = at::randn({16, 16}, options); - FusionExecutorCache fusion_executor_cache(std::move(fusion)); - fusion_executor_cache.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + executor_cache.runFusionWithInputs({t0, t1}); } TEST_F(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) { @@ -6004,9 +6004,9 @@ TEST_F(NVFuserTest, FusionSimpleWarp_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); @@ -6053,9 +6053,9 @@ TEST_F(NVFuserTest, FusionSimpleWarpPad_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6098,9 +6098,9 @@ TEST_F(NVFuserTest, FusionWarpPadMergeSplit_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6140,9 +6140,9 @@ TEST_F(NVFuserTest, FusionSerialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6185,9 +6185,9 @@ 
TEST_F(NVFuserTest, FusionTrivialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2, 3}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6240,9 +6240,9 @@ TEST_F(NVFuserTest, FusionMultipleDimBinding_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate( fusion.get(), outputs, @@ -6278,9 +6278,9 @@ TEST_F(NVFuserTest, FusionPadNoWarpReduce_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({16, 31}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6313,9 +6313,9 @@ TEST_F(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) { auto at_output = (input1 + 1).sum({1}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6364,9 +6364,9 @@ TEST_F(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) { auto at_output = input1.sum({1}, true).add(input1); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } @@ -6410,9 +6410,9 @@ TEST_F(NVFuserTest, FusionWarpReducePredication_CUDA) { auto t0 = at::randn(shape1, options); auto t2 = at::randn(shape2, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t2}); - auto cg_outputs = fe.runFusion({t0, t2}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t2}); + auto cg_outputs = ke.run({t0, t2}); auto t1 = t0.sum({0}); auto t4 = (t2 + 1).sum({0}) + 1; @@ -6458,9 +6458,9 @@ TEST_F(NVFuserTest, FusionSegfaultReduction_CUDA) { at::Tensor input0 = at::randn({batch, c, h, w}, options); at::Tensor input1 = at::randn({batch, c, h, w}, options); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector inputs = {input0, input1}; - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -6491,9 +6491,9 @@ TEST_F(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + KernelExecutor ke; + ke.compile(fusion, {in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6535,10 +6535,10 @@ TEST_F(NVFuserTest, FusionBufferReuseStressTest_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = 
at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); + KernelExecutor ke; + ke.compile(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6567,9 +6567,9 @@ TEST_F(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({256, 512}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0}); - auto outputs = fe.runFusion({in0}); + KernelExecutor ke; + ke.compile(fusion, {in0}); + auto outputs = ke.run({in0}); testValidate(fusion, outputs, {in0}, __LINE__, __FILE__); } @@ -6599,9 +6599,9 @@ TEST_F(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + KernelExecutor ke; + ke.compile(fusion, {in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6633,9 +6633,9 @@ TEST_F(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({3, 3, 3}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0}); - auto outputs = fe.runFusion({in0}); + KernelExecutor ke; + ke.compile(fusion, {in0}); + auto outputs = ke.run({in0}); testValidate(fusion, outputs, {in0}, __LINE__, __FILE__); } @@ -6662,9 +6662,9 @@ TEST_F(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({16, 16}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0}); - auto cg_outputs = fe.runFusion({in0}); + KernelExecutor ke; + ke.compile(fusion, {in0}); + auto cg_outputs = ke.run({in0}); testValidate(fusion, cg_outputs, {in0}, __LINE__, __FILE__); } @@ -6696,9 +6696,9 @@ TEST_F(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {in0, in1}); - auto outputs = fe.runFusion({in0, in1}); + KernelExecutor ke; + ke.compile(fusion, {in0, in1}); + auto outputs = ke.run({in0, in1}); testValidate(fusion, outputs, {in0, in1}, __LINE__, __FILE__); } @@ -6722,9 +6722,9 @@ TEST_F(NVFuserTest, FusionIssue970_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({nelm, nelm}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -6753,9 +6753,9 @@ TEST_F(NVFuserTest, FusionIssue1016_CUDA) { at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, {t0}, __LINE__, __FILE__); } @@ -6784,9 +6784,9 @@ TEST_F(NVFuserTest, FusionIssue1021_CUDA) { at::Tensor t0 = at::randn({10}, options); std::vector inputs = {t0}; - 
FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -6819,9 +6819,9 @@ TEST_F(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { auto at_tv1 = (input1).sum({0}); auto at_tv2 = input1 + 1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate( fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__); } @@ -6856,9 +6856,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6893,9 +6893,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap2_CUDA) { at::Tensor input1 = at::randn({11}, options); at::Tensor input2 = at::randn({11, 13}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate(fusion.get(), outputs, {input1, input2}, __LINE__, __FILE__); } @@ -6941,9 +6941,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({13}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {input1}); - auto outputs = fe.runFusion({input1}); + KernelExecutor ke; + ke.compile(fusion.get(), {input1}); + auto outputs = ke.run({input1}); testValidate(fusion.get(), outputs, {input1}, __LINE__, __FILE__); } @@ -6987,9 +6987,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap4_CUDA) { at::Tensor input1 = at::randn({13}, options); at::Tensor input2 = at::randn({15, 13}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate(&fusion, outputs, {input1, input2}, __LINE__, __FILE__); } @@ -7031,9 +7031,9 @@ TEST_F(NVFuserTest, FusionParallelDimensionMap5_CUDA) { at::Tensor input1 = at::randn({13}, options); at::Tensor input2 = at::randn({13, 15}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - auto outputs = fe.runFusion({input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + auto outputs = ke.run({input1, input2}); testValidate(&fusion, outputs, {input1, input2}, __LINE__, __FILE__); } @@ -7185,9 +7185,9 @@ TEST_F(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { at::Tensor t0 = at::randn({nx}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7238,9 +7238,9 @@ TEST_F(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { at::Tensor t0 = at::randn({17}, 
options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7289,9 +7289,9 @@ TEST_F(NVFuserTest, FusionIssue1099_CUDA) { at::Tensor t3 = at::randn({19}, options); std::vector aten_inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7331,9 +7331,9 @@ TEST_F(NVFuserTest, FusionUnswitchPredicate_CUDA) { at::Tensor t0 = at::randn({nx, ny}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7370,9 +7370,9 @@ TEST_F(NVFuserTest, FusionIssue1189_CUDA) { at::Tensor t0 = at::randn({16, 16, 1}, options); at::Tensor t1 = at::randn({16, 16, 1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto outputs = ke.run({t0, t1}); testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__); } @@ -7403,9 +7403,9 @@ TEST_F(NVFuserTest, FusionIssue1052_CUDA) { at::Tensor t1 = at::randn({100}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7509,9 +7509,9 @@ TEST_F(NVFuserTest, FusionSmemAliasSerial_CUDA) { at::Tensor t4 = at::randn({1024}, options); std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7539,9 +7539,9 @@ TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { at::Tensor t2 = at::randn({19}, options); std::vector aten_inputs = {t0, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7569,9 +7569,9 @@ TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { at::Tensor t2 = at::randn({19}, options); std::vector aten_inputs = {t0, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref1 = t0 + 1; auto ref2 = mean(t2, {0}); @@ -7610,15 +7610,15 @@ TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { tv5->axis(1)->parallelize(ParallelType::BIDy); tv5->axis(2)->parallelize(ParallelType::BIDz); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + 
ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 3}, options); at::Tensor t2 = at::randn({5, 6, 7}, options); at::Tensor t4 = at::randn({8, 9, 10}, options); std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7654,15 +7654,15 @@ TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { tv5->axis(1)->parallelize(ParallelType::BIDy); tv5->axis(2)->parallelize(ParallelType::BIDz); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({2, 3}, options); at::Tensor t2 = at::randn({5, 6, 7}, options); at::Tensor t4 = at::randn({8, 9, 10}, options); std::vector aten_inputs = {t0, t2, t4}; - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref1 = t0.mean(at::IntArrayRef{0, 1}); auto ref2 = t2 + 1; @@ -7723,9 +7723,9 @@ TEST_F(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { at::Tensor t4 = at::randn({19}, options); std::vector aten_inputs = {t0, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref1 = t0 + 3; auto ref2 = sum(t4 + 4); @@ -7785,9 +7785,9 @@ TEST_F(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { at::Tensor t1 = at::randn({19}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7834,9 +7834,9 @@ TEST_F(NVFuserTest, FusionFloatPow_CUDA) { t0 = abs(t0); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto p4 = at::pow(t0, 4); auto p2 = at::pow(t0, 2); @@ -7903,9 +7903,9 @@ TEST_F(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { at::Tensor t0 = at::randn({10, 1024}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -7926,9 +7926,9 @@ TEST_F(NVFuserTest, FusionNonContigOutputs_CUDA) { at::Tensor at_input = at::randn({10}, options); at::Tensor at_output = at::empty_strided({10}, {2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_input}); - auto returned_outputs = fe.runFusion({at_input}, {at_output}); + KernelExecutor ke; + ke.compile(&fusion, {at_input}); + auto returned_outputs = ke.run({at_input}, {at_output}); // Returned outputs should only contain one tensor that is the same // as the output tensor given to runFusion @@ -7974,9 +7974,9 @@ TEST_F(NVFuserTest, FusionTestWarpSoftMax_CUDA) { } // Test result - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = 
ke.run(aten_inputs); auto ref_output = at::_softmax(aten_input, 1, false); testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__); } @@ -8048,9 +8048,9 @@ TEST_F(NVFuserTest, FusionIssue1133_CUDA) { at::Tensor t0 = at::randn({99, 101}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref = (t0 + 1).sum({1}) + 1; @@ -8082,9 +8082,9 @@ TEST_F(NVFuserTest, FusionRfactorContigIDs_CUDA) { at::Tensor t0 = at::randn({99, 101}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref = t0.sum({1}); @@ -8137,9 +8137,9 @@ TEST_F(NVFuserTest, FusionIssue1223_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_t0 = at::ones({11, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0}); - auto cg_outputs = fe.runFusion({at_t0}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0}); + auto cg_outputs = ke.run({at_t0}); auto at_t1 = (at_t0 + 1).sum(); @@ -8181,9 +8181,9 @@ TEST_F(NVFuserTest, FusionRfactorPredication1_CUDA) { at_t0 = at::abs(at_t0); at::Tensor at_t3 = at::randn({128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t3}); - auto cg_outputs = fe.runFusion({at_t0, at_t3}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0, at_t3}); + auto cg_outputs = ke.run({at_t0, at_t3}); auto at_t2 = (at_t0 + 1).min(); auto at_t4 = at_t3 + 1; @@ -8233,9 +8233,9 @@ TEST_F(NVFuserTest, FusionRfactorPredication2_CUDA) { at_t0 = at::abs(at_t0); at::Tensor at_t3 = at::randn({128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t3}); - auto cg_outputs = fe.runFusion({at_t0, at_t3}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0, at_t3}); + auto cg_outputs = ke.run({at_t0, at_t3}); auto at_t2 = std::get<0>(at_t0.min(0)); auto at_t4 = at_t3 + 1; @@ -8270,9 +8270,9 @@ TEST_F(NVFuserTest, FusionRfactorIndirectRoot_CUDA) { auto at_in = at::randn({6, 6, 6}, options); auto at_out = at_in.sum({1, 2}); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_in}); - auto cg_outputs = fe.runFusion({at_in}); + KernelExecutor ke; + ke.compile(&fusion, {at_in}); + auto cg_outputs = ke.run({at_in}); testValidate(&fusion, cg_outputs, {at_in}, {at_out}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index 9862dcb8b07..f7deca99425 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -107,9 +107,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.sum(); @@ -161,9 +161,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({13, 17}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, 
cg_outputs, {t0}, __LINE__, __FILE__); } @@ -210,9 +210,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -260,9 +260,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24, 2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -314,9 +314,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({24}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -357,9 +357,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); @@ -367,7 +367,7 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is // illegal. The run-time validation of vectorization should throw an error. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible})); + ASSERT_ANY_THROW(ke.run({t0_non_divisible})); } // If a split is validated at run time, it's not necessary to predicate. 
@@ -412,9 +412,9 @@ TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { auto t0 = at::randn({1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum(); @@ -474,16 +474,16 @@ TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({15}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); // This should throw an exception as the extent of t0 is not // divisible by the vector width // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); auto t1 = at::randn({16}, options); - auto cg_outputs = fe.runFusion({t1}); + auto cg_outputs = ke.run({t1}); testValidate(&fusion, cg_outputs, {t1}, __LINE__, __FILE__); } @@ -529,9 +529,9 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) { auto t2 = at::randn({10, 10}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -572,9 +572,9 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) { auto t0 = at::randn({10, 11}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t3 = t0.sum().unsqueeze(-1).unsqueeze(-1); @@ -617,9 +617,9 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) { auto t0 = at::randn(input_shape, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -790,9 +790,9 @@ TEST_F(NVFuserTest, FusionIssue1430_CUDA) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor t0 = at::randn({V, W, X, Y, Z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto cg_outputs = fe.runFusion({t0}, LaunchParams(X, V, -1, Y, -1, -1)); + KernelExecutor ke; + ke.compile(&fusion); + auto cg_outputs = ke.run({t0}, LaunchParams(X, V, -1, Y, -1, -1)); auto t0_double = t0.to(at::kDouble); @@ -944,9 +944,9 @@ TEST_F(NVFuserTest, FusionTestGridComm_CUDA) { auto t0 = at::randn({X, Y, Z}, options); auto t1 = at::randn({X, Y, Z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -988,9 +988,9 @@ TEST_F(NVFuserTest, FusionTestGridComm2_CUDA) { auto t0 = at::randn({X}, options); auto t1 = at::randn({W, X}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); 
} @@ -1021,9 +1021,9 @@ TEST_F(NVFuserTest, FusionLargeSmem_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({(int)(12288 * 4)}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0 + 1 + 2; testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); @@ -1057,10 +1057,10 @@ TEST_F(NVFuserTest, FusionTooLargeSmem_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({(int)(12288 * 4)}, options); - FusionExecutor fe; + KernelExecutor ke; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0})); } // Try to test alignment when multiple tensors are @@ -1097,10 +1097,10 @@ TEST_F(NVFuserTest, FusionSmemAlignment_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({3, 4, 7, 2, 5}, options); - FusionExecutor fe; + KernelExecutor ke; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -1126,8 +1126,8 @@ TEST_F(NVFuserTest, FusionImmediateValueAsInput_CUDA) { fusion.addOutput(tv1); // Make sure the kernel is compiled. - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); } // Repro of #1506 @@ -1157,9 +1157,9 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndex_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); } @@ -1192,10 +1192,10 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexFail_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; + KernelExecutor ke; // This should fail at compile time as we're trying to merge in a // non-contiguous dimension, then split and vectorize it. - ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0})); + ASSERT_ANY_THROW(ke.compile(&fusion, {t0})); } // Make sure the same fusion as FusionVectorizeContigIndex fails if @@ -1227,14 +1227,14 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexFail2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // This should fail at the launch time as 14 is not divisible by the // vector word size. The two domains are merged, but they are not // contiguous, so contig indexing is not involved in this case. 
// NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); } TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) { @@ -1260,18 +1260,18 @@ TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) { auto t1_misaligned = at::empty({n + 1}, options).index({at::indexing::Slice(1)}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); // Pass misaligned input. This must fail. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0_misaligned})); + ASSERT_ANY_THROW(ke.run({t0_misaligned})); // Pass misaligned output. This must fail too. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0}, {t1_misaligned})); + ASSERT_ANY_THROW(ke.run({t0}, {t1_misaligned})); } // Repro of issue #1530 @@ -1300,11 +1300,11 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0})); + ASSERT_ANY_THROW(ke.run({t0})); } TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { @@ -1331,9 +1331,9 @@ TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { auto t1 = at::randn({3, 4}, options); { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1341,9 +1341,9 @@ TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) { // Make sure tv2 indexing also works when it's stored in global memory tv2->setMemoryType(MemoryType::Global); { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1384,12 +1384,12 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail2_CUDA) { auto t0 = at::randn(shape1, options); auto t1 = at::randn(shape2, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); // Vectorization of tv2 should be detected as invalid. // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) - ASSERT_ANY_THROW(fe.runFusion({t0, t1})); + ASSERT_ANY_THROW(ke.run({t0, t1})); } TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) { @@ -1433,9 +1433,9 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) { auto t0 = at::randn(shape1, options); auto t1 = at::randn(shape2, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1467,7 +1467,7 @@ TEST_F(NVFuserTest, FusionVectorizeContigIndexPointwiseSchedule_CUDA) { // vector word size should be 4. Broadcasting of tv1 should not // matter. 
for (const auto& vec_info : - cg_results.fusion_executor->kernel()->summary().vectorized_set_info) { + cg_results.kernel_executor->kernel()->summary().vectorized_set_info) { NVF_CHECK( vec_info.word_size == 4, "Invalid vector word size: ", @@ -1512,9 +1512,9 @@ TEST_F(NVFuserTest, FusionTrivialReductionForwarding4_CUDA) { auto t0 = at::randn({111}, options); auto t1 = at::randn({123, 111}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto t2 = t0.unsqueeze(0); auto t3 = t1 + t2; @@ -1563,9 +1563,9 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace1_CUDA) { auto t0 = at::randn({10, 64}, options); auto t1 = at::randn({10, 64}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1608,9 +1608,9 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace2_CUDA) { auto t0 = at::randn({10, 64}, options); auto t1 = at::randn({10, 64}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1651,9 +1651,9 @@ TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace3_CUDA) { auto t0 = at::randn({50, 64}, options); auto t1 = at::randn({50, 64}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1756,9 +1756,9 @@ TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead1_CUDA) { at::Tensor t1 = at::randn({128, 6}, options); at::Tensor t2 = at::randn({128, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t2}); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__); } @@ -1796,9 +1796,9 @@ TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead2_CUDA) { at::Tensor t1 = at::randn({128, 6}, options); at::Tensor t2 = at::randn({128, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1, t2}); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__); } @@ -1831,20 +1831,20 @@ TEST_F(NVFuserTest, FusionSimpleCpAsync_CUDA) { at::Tensor t0 = at::randn({m, n}, options); at::Tensor t1 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, {t0, t1}); }, + [&]() { ke.compile(&fusion, {t0, t1}); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - fe.compileFusion(&fusion, {t0, t1}); + ke.compile(&fusion, {t0, t1}); } - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + ke.compile(&fusion, {t0, t1}); + auto 
cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -1877,19 +1877,19 @@ TEST_F(NVFuserTest, FusionCpAsyncPredicate_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({m, n}, options); - FusionExecutor fe; + KernelExecutor ke; if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, {t0}); }, + [&]() { ke.compile(&fusion, {t0}); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - fe.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); } - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.sum({1}); @@ -2006,11 +2006,11 @@ TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({9999}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); - testValidate(fe.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__); } // Test ExactLogicalDomainMap @@ -2211,13 +2211,13 @@ TEST_F(NVFuserTest, FusionTestReEntrantGridWelford_CUDA) { GpuLower gpulw(&fusion); checker.handle(gpulw.run()->topLevelExprs()); - FusionExecutor fe; - fe.compileFusion(&fusion, {}, LaunchParams()); + KernelExecutor ke; + ke.compile(&fusion, {}, LaunchParams()); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor t0 = at::randn({X, Y, Y, Z}, options); - auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, -1, -1, -1, -1, -1)); + auto cg_outputs = ke.run({t0}, LaunchParams(-1, -1, -1, -1, -1, -1)); // by default Welford outputs sum of square diff so need to divide to get var cg_outputs[1] = cg_outputs[1].div((float)(X * Y * Y)); @@ -2280,9 +2280,9 @@ TEST_F(NVFuserTest, FusionRedundantPredSync_CUDA) { at::Tensor t0 = at::randn({32}, options); at::Tensor t1 = at::randn({32, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -2345,9 +2345,9 @@ TEST_F(NVFuserTest, FusionRedundantPredSync2_CUDA) { at::Tensor t0 = at::randn({32}, options); at::Tensor t1 = at::randn({32, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -2427,9 +2427,9 @@ TEST_F(NVFuserTest, FusionRedundantPredSync3_CUDA) { at::Tensor t0 = at::randn({32}, options); at::Tensor t1 = at::randn({32, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -2532,9 +2532,9 @@ TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) { at::Tensor t0 = at::randn({10, 11}, options); std::vector aten_inputs = {t0}; - 
FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -2567,9 +2567,9 @@ TEST_F(NVFuserTest, FusionSqueeze1_CUDA) { at::Tensor t0 = at::randn({10, 11}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -2596,11 +2596,11 @@ TEST_F(NVFuserTest, FusionContigPredicate_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); - testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); + testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); } // Repro of https://github.com/csarofeen/pytorch/issues/1777 @@ -2620,9 +2620,9 @@ TEST_F(NVFuserTest, FusionDivScalarLhs_CUDA) { auto aten_output = at::div( at::native::wrapped_scalar_tensor(at::Scalar(2.0), options.device()), t0); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {aten_output}, __LINE__, __FILE__); } @@ -3242,9 +3242,9 @@ TEST_F(NVFuserTest, FusionIssue1785Repro_CUDA) { at::Tensor in1 = at::randn({16}, options); at::Tensor in2 = at::randn({12, 16}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {in1, in2}); - auto cg_outputs = fe.runFusion({in1, in2}); + KernelExecutor ke; + ke.compile(&fusion, {in1, in2}); + auto cg_outputs = ke.run({in1, in2}); testValidate(&fusion, cg_outputs, {in1, in2}, __LINE__, __FILE__); } @@ -3516,9 +3516,9 @@ TEST_F(NVFuserTest, FusionVectorComponentReduce_CUDA) { at::TensorOptions().dtype(at::kComplexFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__, ""); } @@ -3799,9 +3799,9 @@ TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({5, 5}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto out = cg_outputs[0]; testValidate(fusion, {out}, {t0}, __LINE__, __FILE__); @@ -3879,9 +3879,9 @@ TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) { at::Tensor t0 = at::randn({1, 1}, options); at::Tensor t1 = at::randn({10}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto out = cg_outputs[0]; testValidate( @@ -3923,9 +3923,9 @@ TEST_F(NVFuserTest, FusionMappingRelation_CUDA) { at::Tensor t0 = at::randn({1, 1}, options); 
at::Tensor t1 = at::randn({2, 1, 1}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto out = cg_outputs[0]; testValidate(fusion, {out}, {t0, t1}, __LINE__, __FILE__); @@ -3947,9 +3947,9 @@ TEST_F(NVFuserTest, FusionInlineAt_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({100, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto out = cg_outputs[0]; testValidate(fusion, {out}, {t0}, __LINE__, __FILE__); @@ -3981,9 +3981,9 @@ TEST_F(NVFuserTest, FusionReplayTrivialReductionAndBroadcast2_CUDA) { at::Tensor t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4043,19 +4043,19 @@ TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) { GpuLower gpulw(&fusion); pred_checker.handle(gpulw.run()->topLevelExprs()); - FusionExecutor fe; + KernelExecutor ke; // requires ampere+ GPU if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, {input1}); }, + [&]() { ke.compile(&fusion, {input1}); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - fe.compileFusion(&fusion, {input1}); + ke.compile(&fusion, {input1}); } - auto cg_outputs = fe.runFusion({input1}); + auto cg_outputs = ke.run({input1}); testValidate(&fusion, cg_outputs, {input1}, __LINE__, __FILE__); } @@ -4078,8 +4078,8 @@ TEST_F(NVFuserTest, FusionExpandedInput_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({4096, 1, 4}, options).expand({-1, 7, -1}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); testValidate(fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4112,8 +4112,8 @@ TEST_F(NVFuserTest, FusionVectorizeRepro1843_CUDA) { at::Tensor t0 = at::empty_strided({4096, 32128}, {32128, 1}, options).random_(); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t1, t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t1, t0}); testValidate(fusion, cg_outputs, {t1, t0}, __LINE__, __FILE__); } @@ -4137,8 +4137,8 @@ TEST_F(NVFuserTest, FusionBroadcastPersistentReduction_CUDA) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({1024, 768}, options); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); testValidate(fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4284,8 +4284,8 @@ TEST_F(NVFuserTest, FusionRepro2094_CUDA) { outputs.push_back(t32); } - FusionExecutorCache fec(std::move(fusion_ptr)); - 
auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -4428,9 +4428,9 @@ TEST_F(NVFuserTest, FusionSqueezeTransformPropagation_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({5, 1, 1, 1, 1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4482,9 +4482,9 @@ TEST_F(NVFuserTest, FusionSqueezeInlining_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({1, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -4885,9 +4885,9 @@ TEST_F(NVFuserTest, FusionPropagateVectorizePredicate_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); NVF_CHECK(t0.equal(cg_outputs[0])); } @@ -4992,16 +4992,16 @@ TEST_F(NVFuserTest, FusionIssue2163ReproInvalidAlias_CUDA) { std::vector aten_inputs({at_input, at_weight}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto cg_output = cg_outputs.at(0); auto ref_x_sub_mean = at_input - at_input.sum({0}).unsqueeze(0); auto ref_y = ref_x_sub_mean * at_weight.unsqueeze(0); testValidate( - fe.kernel(), {cg_output}, aten_inputs, {ref_y}, __LINE__, __FILE__, ""); + ke.kernel(), {cg_output}, aten_inputs, {ref_y}, __LINE__, __FILE__, ""); } // Testing scalar FP types @@ -5080,9 +5080,9 @@ TEST_F(NVFuserTest, FusionFloatingPointType_CUDA) { std::vector inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); testValidate(&fusion, cg_outputs, inputs, __LINE__, __FILE__); } @@ -5146,9 +5146,9 @@ TEST_F(NVFuserTest, FusionIntegerType_CUDA) { std::vector inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto i2 = int64_val; auto i3 = int_val; @@ -5209,16 +5209,16 @@ TEST_F(NVFuserTest, FusionVectorizeWelford1_CUDA) { at::Tensor t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref_avg = t0.mean({0}); auto ref_var = t0.var({0}, false) * shape[0]; auto ref_N = at::ones({shape[1]}, options_int) * shape[0]; testValidate( - fe.kernel(), + ke.kernel(), cg_outputs, {t0}, {ref_avg, ref_var, ref_N}, @@ -5282,16 +5282,16 @@ TEST_F(NVFuserTest, FusionVectorizeWelford2_CUDA) { at::Tensor t0 = 
at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref_avg = t0.to(at::kDouble).mean({0}); auto ref_var = t0.to(at::kDouble).var({0}, false) * shape[0]; auto ref_N = at::ones({shape[1]}, options_int) * shape[0]; testValidate( - fe.kernel(), + ke.kernel(), cg_outputs, {t0}, {ref_avg, ref_var, ref_N}, @@ -5320,7 +5320,7 @@ TEST_F(NVFuserTest, FusionRepro2241_CUDA) { fusion->addOutput(t7); } - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().device(at::kCUDA, 0); at::Tensor t6 = at::tensor({15}, options.dtype(at::kLong)); @@ -5328,7 +5328,7 @@ TEST_F(NVFuserTest, FusionRepro2241_CUDA) { at::Tensor t20 = at::tensor({12}, options.dtype(at::kLong)).expand({1, 1, 1, 1}); - auto cg_outputs = fec.runFusionWithInputs({t6, t15, t20}); + auto cg_outputs = executor_cache.runFusionWithInputs({t6, t15, t20}); auto sample_total = at::sum(t15, {0, 1, 2, 3}, true); auto sample_mean = at::div(sample_total, t20); @@ -5338,7 +5338,12 @@ TEST_F(NVFuserTest, FusionRepro2241_CUDA) { auto t7 = at::div(total, t6); testValidate( - fec.fusion(), cg_outputs, {t6, t15, t20}, {t7}, __LINE__, __FILE__); + executor_cache.fusion(), + cg_outputs, + {t6, t15, t20}, + {t7}, + __LINE__, + __FILE__); } TEST_F(NVFuserTest, FusionExprSortMatmulLikeSchedule_CUDA) { @@ -5379,11 +5384,11 @@ TEST_F(NVFuserTest, FusionExprSortMatmulLikeSchedule_CUDA) { at::Tensor t0 = at::randn({M1, M2, K1, K2}, options); at::Tensor t1 = at::randn({N1, N2, K1, K2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); - testValidate(fe.kernel(), cg_outputs, {t0, t1}, __LINE__, __FILE__); + testValidate(ke.kernel(), cg_outputs, {t0, t1}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionFloatConstantWhere_CUDA) { @@ -5439,19 +5444,19 @@ TEST_F(NVFuserTest, FusionCpAsyncCommitWait_CUDA) { at::Tensor t0 = at::randn({12800, 8, 8, 8}, options); - FusionExecutor fe; + KernelExecutor ke; if (!deviceMajorMinorCheck(8)) { ASSERT_THAT( - [&]() { fe.compileFusion(&fusion, {t0}); }, + [&]() { ke.compile(&fusion, {t0}); }, testing::ThrowsMessage(testing::HasSubstr( "Reason: LoadStoreOpType::CpAsync requires Ampere"))); GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; } else { - fe.compileFusion(&fusion, {t0}); + ke.compile(&fusion, {t0}); } - auto cg_outputs = fe.runFusion({t0}); - testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); + auto cg_outputs = ke.run({t0}); + testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__); } // Repro of issue #2459 @@ -5514,14 +5519,14 @@ TEST_F(NVFuserTest, FusionClearThreadPredicateByRAWSync_CUDA) { std::vector inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto t3 = t0.sum({1}).sum({0}); auto t6 = t0.sum({1}); - testValidate(fe.kernel(), cg_outputs, inputs, {t3, t6}, __LINE__, __FILE__); + testValidate(ke.kernel(), cg_outputs, inputs, {t3, t6}, __LINE__, __FILE__); } namespace { @@ -5636,15 +5641,15 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitShared_CUDA) { std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, 
inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref_t1 = t0.sum({0}); auto ref_t4 = t1.exp(); testValidate( - fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t4}, __LINE__, __FILE__); + ke.kernel(), cg_outputs, inputs, {ref_t1, ref_t4}, __LINE__, __FILE__); } // Repro of issue #2487 @@ -5690,15 +5695,15 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) { std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref_t1 = t0.sum({0}); auto ref_t3 = t1.exp(); testValidate( - fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__); + ke.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionTypePromotionATenConsistency_CUDA) { @@ -5763,74 +5768,71 @@ TEST_F(NVFuserTest, FusionCompileIndexType_CUDA) { .getSmallestIndexTypeOfArguments() == PrimDataType::Int32); { - FusionExecutor fe; + KernelExecutor ke; // Lower the kernel with large inputs and int64 index type. CompileParams compile_opts = {.index_type = PrimDataType::Int}; - fe.compileFusion(&fusion, large_inputs, LaunchParams(), compile_opts); + ke.compile(&fusion, large_inputs, LaunchParams(), compile_opts); NVF_CHECK( - fe.kernel()->indexType() == PrimDataType::Int, + ke.kernel()->indexType() == PrimDataType::Int, "Unexpected kernel index type: ", - fe.kernel()->indexType()); + ke.kernel()->indexType()); // Since the index type is int64, both small and large inputs // should work fine - fe.runFusion(small_inputs); - fe.runFusion(large_inputs); + ke.run(small_inputs); + ke.run(large_inputs); } { - FusionExecutor fe; + KernelExecutor ke; // Lower the kernel with small inputs and int64 index type. 
CompileParams compile_opts = {.index_type = PrimDataType::Int}; - fe.compileFusion(&fusion, small_inputs, LaunchParams(), compile_opts); + ke.compile(&fusion, small_inputs, LaunchParams(), compile_opts); NVF_CHECK( - fe.kernel()->indexType() == PrimDataType::Int, + ke.kernel()->indexType() == PrimDataType::Int, "Unexpected kernel index type: ", - fe.kernel()->indexType()); + ke.kernel()->indexType()); // Since the index type is int64, both small and large inputs // should work fine - fe.runFusion(small_inputs); - fe.runFusion(large_inputs); + ke.run(small_inputs); + ke.run(large_inputs); } { - FusionExecutor fe; + KernelExecutor ke; LaunchParams launch_params; CompileParams compile_opts = {.index_type = PrimDataType::Int32}; - fe.compileFusion(&fusion, small_inputs, launch_params, compile_opts); + ke.compile(&fusion, small_inputs, launch_params, compile_opts); NVF_CHECK( - fe.kernel()->indexType() == PrimDataType::Int32, + ke.kernel()->indexType() == PrimDataType::Int32, "Unexpected kernel index type: ", - fe.kernel()->indexType()); + ke.kernel()->indexType()); // This should complete successfully as the arguments are small // enough to use the int32 index type - fe.runFusion(small_inputs); + ke.run(small_inputs); // This should fail as the Kernel is already compiled for Int32, but // the arguments are too large CompileParams compile_opts_large = {.index_type = PrimDataType::Int}; EXPECT_THAT( - [&]() { - fe.runFusion(large_inputs, launch_params, compile_opts_large); - }, + [&]() { ke.run(large_inputs, launch_params, compile_opts_large); }, testing::ThrowsMessage(testing::HasSubstr( "Kernel index type and compilation index type don't match"))); } { - FusionExecutor fe; + KernelExecutor ke; // Lower the kernel with large inputs and int32 index type. 
CompileParams compile_opts = {.index_type = PrimDataType::Int32}; // This should fail due to the conflict EXPECT_THAT( [&]() { - fe.compileFusion( - &fusion, large_inputs, LaunchParams(), compile_opts); + ke.compile(&fusion, large_inputs, LaunchParams(), compile_opts); }, testing::ThrowsMessage(testing::HasSubstr( "Compilation with int32 is requested but int64 is required for the arguments"))); @@ -6034,13 +6036,14 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteBroadcastedSoftmaxInput_CUDA) { at::Tensor t1 = at::ones(shape1, options); std::vector inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); // check thread_pred and write_stride - const auto& fe = fec.getMostRecentKernelRuntime()->executors().at(0); - auto kernel = fe.kernel(); - const auto& thread_pred_map = fe.threadPredMap(); + const auto& ke = + executor_cache.getMostRecentKernelRuntime()->executors().at(0); + auto kernel = ke.kernel(); + const auto& thread_pred_map = ke.threadPredMap(); for (const auto expr : kernel->exprs()) { auto tv = ir_utils::getTvOutput(expr); if (tv && tv->name() == 15 && tv->getMemoryType() == MemoryType::Global) { @@ -6054,7 +6057,7 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteBroadcastedSoftmaxInput_CUDA) { } } - testValidate(fec.fusion(), cg_outputs, inputs, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), cg_outputs, inputs, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) { @@ -6089,13 +6092,14 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) { at::Tensor t1 = at::randn(shape1, options); std::vector inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); // check thread_pred and write_stride - const auto& fe = fec.getMostRecentKernelRuntime()->executors().at(0); - auto kernel = fe.kernel(); - const auto& thread_pred_map = fe.threadPredMap(); + const auto& ke = + executor_cache.getMostRecentKernelRuntime()->executors().at(0); + auto kernel = ke.kernel(); + const auto& thread_pred_map = ke.threadPredMap(); for (const auto expr : kernel->exprs()) { auto tv = ir_utils::getTvOutput(expr); @@ -6110,7 +6114,8 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWrite_CUDA) { } } - testValidate(fec.fusion(), cg_outputs, inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, inputs, __LINE__, __FILE__); }; // Test case where [B1,I2,I3] is merged to [B1I2I3] @@ -6189,13 +6194,14 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteDifferentConcretizedDomains_CUDA) { testing::ThrowsMessage(testing::HasSubstr( "Producer is required to be in Global Memory based on parallelization strategy. 
RAW flags: (blockIdx.x)"))); } else { - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - auto optimized_fusion = fec.getMostRecentKernelRuntime(); + auto optimized_fusion = executor_cache.getMostRecentKernelRuntime(); NVF_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen!"); - testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); } }; runTest(true); @@ -6239,13 +6245,13 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonOutput_CUDA) { at::Tensor t1 = at::randn({32, 64}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), inputs); + auto cg_outputs = ke.run(inputs); // check thread_pred - auto kernel = fe.kernel(); - const auto& thread_pred_map = fe.threadPredMap(); + auto kernel = ke.kernel(); + const auto& thread_pred_map = ke.threadPredMap(); for (const auto expr : kernel->exprs()) { auto tv = ir_utils::getTvOutput(expr); @@ -6303,13 +6309,13 @@ TEST_F(NVFuserTest, FusionAvoidRedundantWriteNonNeighbor_CUDA) { at::Tensor t1 = at::randn({8, 7, 10, 12, 9}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), inputs); + auto cg_outputs = ke.run(inputs); // check thread_pred - auto kernel = fe.kernel(); - const auto& thread_pred_map = fe.threadPredMap(); + auto kernel = ke.kernel(); + const auto& thread_pred_map = ke.threadPredMap(); for (const auto expr : kernel->exprs()) { auto tv = ir_utils::getTvOutput(expr); @@ -6759,9 +6765,9 @@ TEST_F(ExpandedBroadcastGlobalIntermediateTest, TheTest_CUDA) { at::Tensor t0 = at::randn({2, 1, 2}, options); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), {t0}); - auto cg_output = fe.runFusion({t0}).at(0); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), {t0}); + auto cg_output = ke.run({t0}).at(0); ASSERT_EQ(cg_output.size(0), 2); ASSERT_EQ(cg_output.size(1), (1L << 60L)); @@ -6808,10 +6814,9 @@ TEST_F(NVFuserTest, FusionTestWarnRegisterSpill_CUDA) { auto compile_opts = heuristic_params->cparams; compile_opts.maxrregcount = 32; compile_opts.enable_ptxas_verbose = true; - FusionExecutor fe; - fe.compileFusion( - &fusion, {aten_input}, heuristic_params->lparams, compile_opts); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}, heuristic_params->lparams, compile_opts); + auto cg_outputs = ke.run({aten_input}); // validate results testValidate( @@ -6926,9 +6931,9 @@ TEST_F(NVFuserTest, IsFinite_CUDA) { std::array data{1.0, INFINITY, NAN}; const auto input = at::from_blob(data.data(), {3}, {1}).to(options); - FusionExecutor fe; - fe.compileFusion(fusion, {input}); - const auto output = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(fusion, {input}); + const auto output = ke.run({input}); testValidate(fusion, output, {input}, __LINE__, __FILE__); } @@ -7026,8 +7031,8 @@ TEST_F(NVFuserTest, FusionOptionsGuard_CUDA) { // capture stdout and check stdout contains register spill warning captureStdout(); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, 
{aten_input}, heuristic_params->lparams, @@ -7070,18 +7075,18 @@ TEST_F(NVFuserTest, FusionDisableKernelReuse_CUDA) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto a5 = at::zeros({5}, options); auto a6 = at::zeros({6}, options); auto a7 = at::zeros({7}, options); - fec.runFusionWithInputs({a5}); + executor_cache.runFusionWithInputs({a5}); - auto numRuntimes = [&fec]() -> size_t { + auto numRuntimes = [&executor_cache]() -> size_t { // this is map, vector> - const auto& runtime_map = fec.getKernelRuntimes(); + const auto& runtime_map = executor_cache.getKernelRuntimes(); return runtime_map .begin() // There should be only one device/concretization pair ->second.size(); @@ -7091,7 +7096,7 @@ TEST_F(NVFuserTest, FusionDisableKernelReuse_CUDA) { DisableOptionsGuard og; DisableOptionsGuard::getCurOptions().unset(DisableOption::KernelReuse); - fec.runFusionWithInputs({a6}); + executor_cache.runFusionWithInputs({a6}); // Since kernel reuse is enabled, we should not generate a new runtime EXPECT_EQ(numRuntimes(), 1); @@ -7101,7 +7106,7 @@ TEST_F(NVFuserTest, FusionDisableKernelReuse_CUDA) { DisableOptionsGuard og; DisableOptionsGuard::getCurOptions().set(DisableOption::KernelReuse); - fec.runFusionWithInputs({a7}); + executor_cache.runFusionWithInputs({a7}); // Disabling reuse means we should get a new runtime EXPECT_EQ(numRuntimes(), 2); @@ -7186,9 +7191,9 @@ TEST_F(NVFuserTest, FusionLayerNormSharedMemoryBuffer_CUDA) { "Shouldn't use shared memory buffer!"); } - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = - fec.runFusionWithInputs({aten_input, aten_weight, aten_bias}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs( + {aten_input, aten_weight, aten_bias}); testValidate( &fusion_copy, @@ -7260,8 +7265,8 @@ TEST_F(NVFuserTest, FusionInstanceNormNHWC_CUDA) { outputs.push_back(t4); } - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -7401,9 +7406,9 @@ TEST_F(NVFuserTest, AllInputDtypes) { CompileParams opt{.index_type = index_type}; - FusionExecutor fe; - fe.compileFusion(fusion.get(), args, LaunchParams{}, opt); - auto outputs = fe.runFusion(args, LaunchParams{}, opt); + KernelExecutor ke; + ke.compile(fusion.get(), args, LaunchParams{}, opt); + auto outputs = ke.run(args, LaunchParams{}, opt); auto kernel_result = outputs.at(0).item(); auto expect = ee.evaluate(output).as().item(); @@ -7521,9 +7526,9 @@ TEST_F(NVFuserTest, OpaqueTupleAsComplex) { KernelArgumentHolder args; args.push(Opaque(std::array{1.2, 3.4})); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion(args); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run(args); EXPECT_EQ( outputs.at(0).item>(), c10::complex(1.2, 3.4)); @@ -7548,9 +7553,9 @@ TEST_F(NVFuserTest, StructConstruct) { fusion.addOutput(tv); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({1.2, 3.4}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({1.2, 3.4}); EXPECT_EQ( outputs.at(0).item>(), c10::complex(1.2, 3.4)); @@ -7586,12 +7591,12 @@ 
TEST_F(NVFuserTest, VectorizationStrideValidation) { auto t0 = at::randn(shape, options).expand({-1, 5, -1}); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); // This previously triggered a false positive error with the stride // validation - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); ASSERT_TRUE(cg_outputs[0].equal(t0)); } @@ -7615,10 +7620,10 @@ TEST_F(NVFuserTest, ConstLongExpressions) { auto tv0 = full({}, s1, DataType::Int); fusion->addOutput(tv0); - FusionExecutor fe; - fe.compileFusion(fusion); + KernelExecutor ke; + ke.compile(fusion); - auto outputs = fe.runFusion({}); + auto outputs = ke.run({}); testValidate(fusion, outputs, {}, __LINE__, __FILE__); } @@ -7687,10 +7692,10 @@ TEST_F(NVFuserTest, PredicateRNGOps) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({2048, size}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0}); + KernelExecutor ke; + ke.compile(fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); } TEST_F(NVFuserTest, LoweringHook) { @@ -7849,8 +7854,8 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) { NVF_CHECK(kernel_runtime->isSegmented(), "segmentation didn't happen"); const auto num_segments = kernel_runtime->fusionSegments()->groups().size(); NVF_CHECK(num_segments == 3, "Expect 3 segments, got: ", num_segments); - for (const auto& fe : kernel_runtime->executors()) { - for (auto expr : fe.fusion()->exprs()) { + for (const auto& ke : kernel_runtime->executors()) { + for (auto expr : ke.fusion()->exprs()) { if (expr->isA()) { auto slice = expr->as(); NVF_CHECK( @@ -7877,9 +7882,9 @@ TEST_F(NVFuserTest, UnsupportedBFloat) { fusion.addInput(tv0); fusion.addOutput(tv1); - FusionExecutor fe; + KernelExecutor ke; EXPECT_THAT( - [&]() { fe.compileFusion(&fusion); }, + [&]() { ke.compile(&fusion); }, testing::ThrowsMessage( testing::HasSubstr("Reason: Fusion contains BFloat16"))); } @@ -7943,9 +7948,9 @@ TEST_F(NVFuserTest, BlockReduction3D) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.sum(0).sum(-1); testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); }; @@ -7986,9 +7991,9 @@ TEST_F(NVFuserTest, ReverseMerge) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({11, 12}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(t0.equal(cg_outputs.at(0))); } @@ -8016,9 +8021,9 @@ TEST_F(NVFuserTest, FusionCpAsyncPredicateAvoidIllegalMemoryAccess) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({m, n}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); ASSERT_TRUE(t0.equal(cg_outputs.at(0))); } @@ -8350,9 +8355,9 @@ TEST_F(NVFuserTest, BroadcastFromNowhereFusion) { // TODO: use larger tensor size at::Tensor t0 = at::randn({4}, options); at::Tensor t1 = at::randn({2, 
4}, options);
- FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
- auto cg_outputs = fe.runFusion({t0, t1});
+ KernelExecutor ke;
+ ke.compile(&fusion, {t0, t1});
+ auto cg_outputs = ke.run({t0, t1});
testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
}
@@ -8399,8 +8404,8 @@ TEST_F(NVFuserTest, ReplayRFactorMergeBcast) {
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::ones(input_shape, options);
std::vector<c10::IValue> aten_inputs = {at_x};
- FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
- auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
+ FusionExecutorCache executor_cache(std::move(fusion_ptr));
+ auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
}
@@ -8434,9 +8439,9 @@ TEST_F(NVFuserTest, MultipleDifferentSizeGridReduction) {
const at::Tensor t1 = at::randn({192}, options);
const std::vector<c10::IValue> inputs = {t0, t1};
- FusionExecutor fe;
- fe.compileFusion(&fusion, inputs);
- auto cg_outputs = fe.runFusion(inputs);
+ KernelExecutor ke;
+ ke.compile(&fusion, inputs);
+ auto cg_outputs = ke.run(inputs);
testValidate(&fusion, cg_outputs, inputs, __LINE__, __FILE__);
}
@@ -8869,11 +8874,123 @@ TEST_F(NVFuserTest, CpAsyncDataTypeBool) {
// "r"((uint32_t)((!b3)))
// );
// If not correctly lowered, would trigger error in compile
- FusionExecutor fe;
- fe.compileFusion(&fusion, {t0});
- auto cg_outputs = fe.runFusion({t0});
+ KernelExecutor ke;
+ ke.compile(&fusion, {t0});
+ auto cg_outputs = ke.run({t0});
testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}
+
+// Intermediate IDs generated by rFactor should also remain
+// reductions. See #3327 for more info.
+TEST_F(NVFuserTest, RfactorIntermediateIDs) {
+ Fusion fusion;
+ FusionGuard fg(&fusion);
+
+ auto tv0 = makeSymbolicTensor(3);
+ fusion.addInput(tv0);
+
+ auto tv1 = sum(tv0, {1, 2});
+ fusion.addOutput(tv1);
+
+ tv1->merge(1, 2);
+ tv1->split(1, 4);
+
+ auto tv2 = tv1->rFactor({-1});
+
+ EXPECT_TRUE(tv2->axis(-1)->isReduction());
+ EXPECT_FALSE(tv2->axis(-2)->isReduction());
+
+ auto split = dynamic_cast<Split*>(tv2->axis(-1)->definition());
+ ASSERT_NE(split, nullptr);
+
+ auto merge_out = split->in();
+ EXPECT_TRUE(merge_out->isReduction());
+}
+
+// Simple test to make sure replacement with a dependent val is
+// detected as an error
+TEST_F(NVFuserTest, AvoidReplacingWithDependentVal) {
+ Fusion fusion;
+ FusionGuard fg(&fusion);
+
+ auto i0 = IrBuilder::create<Val>(DataType::Int);
+ fusion.addInput(i0);
+
+ auto i1 = mul(i0, IrBuilder::create<Val>(1, DataType::Int));
+
+ auto tv0 = TensorViewBuilder().shape({i1}).build();
+ fusion.addInput(tv0);
+
+ auto tv1 = set(tv0);
+ fusion.addOutput(tv1);
+
+ std::unordered_map<Val*, Val*> replacement_map;
+ replacement_map.emplace(i0, i1);
+
+ EXPECT_THAT(
+ [&]() { ir_utils::replaceValue(&fusion, replacement_map); },
+ testing::ThrowsMessage(testing::HasSubstr(
+ "not allowed as it would result in a recursive definition")));
+}
+
+// Was also a repro of issue #3347
+TEST_F(NVFuserTest, ReplaceSymbolicSizesPreferSimplerExtents) {
+ auto fusion_ptr = std::make_unique<Fusion>();
+ Fusion& fusion = *fusion_ptr;
+ FusionGuard fg(fusion_ptr.get());
+
+ auto tv0 = makeSymbolicTensor(3);
+ fusion.addInput(tv0);
+ auto tv1 = makeSymbolicTensor(2);
+ fusion.addInput(tv1);
+ auto i0 = IrBuilder::create<Val>(DataType::Index);
+ fusion.addInput(i0);
+
+ auto tv2 = reshape(tv0, {i0});
+ auto tv3 = reshape(tv1, {i0});
+ auto tv4 = add(tv2, tv3);
+ fusion.addOutput(tv4);
+
+ ExpressionEvaluator expr_eval;
+
+ expr_eval.bind(tv0->axis(0)->extent(), 2L);
+ expr_eval.bind(tv0->axis(1)->extent(), 4L);
+ expr_eval.bind(tv0->axis(2)->extent(), 8L);
+ expr_eval.bind(tv1->axis(0)->extent(), 8L);
+ expr_eval.bind(tv1->axis(1)->extent(), 8L);
+ expr_eval.bind(i0, 64L);
+
+ auto initial_info = DynamicTransform::getInitialInfo(&fusion);
+ auto info = DynamicTransformConcretizationInfo(&initial_info, &expr_eval);
+
+ DynamicTransform::concretizeFusion(&fusion, &info);
+
+ replaceSymbolicSizes(&fusion);
+
+ // All expr output tensors should use the same extent, namely that of
+ // tv3, since tv3's reshape involves only a single merge, whereas
+ // tv2's reshape involves two merges.
+ auto ref_ext = fusion.outputs().at(0)->as<TensorView>()->axis(0)->extent();
+
+ // ref_ext should look like getMetaData(T1).logical_size[0] *
+ // getMetaData(T1).logical_size[1]
+ auto ext_def = dynamic_cast<BinaryOp*>(ref_ext->definition());
+ ASSERT_NE(ext_def, nullptr);
+ ASSERT_EQ(ext_def->getBinaryOpType(), BinaryOpType::Mul);
+ auto lhs = ext_def->input(0);
+ auto rhs = ext_def->input(1);
+ ASSERT_NE(dynamic_cast(lhs->definition()), nullptr);
+ ASSERT_NE(dynamic_cast(rhs->definition()), nullptr);
+
+ for (auto expr : fusion.exprs()) {
+ auto tv_output = ir_utils::getTvOutput(expr);
+ ASSERT_EQ(tv_output->nDims(), 1);
+ auto ext = tv_output->axis(0)->extent();
+ EXPECT_EQ(ref_ext, ext) << "Reference: " << ref_ext->toString()
+ << ", actual: " << ext->toString();
+ }
+}
+
// Test file size should be up to 10K LoC. Create a new file for more tests.
} // namespace nvfuser
diff --git a/tests/cpp/test_gpu_compute_with.cpp b/tests/cpp/test_gpu_compute_with.cpp
index 9ef2cfced37..df3b5a9bff1 100644
--- a/tests/cpp/test_gpu_compute_with.cpp
+++ b/tests/cpp/test_gpu_compute_with.cpp
@@ -164,9 +164,9 @@ TEST_F(NVFuserTest, FusionComputeWith1_CUDA) {
at::Tensor t0 = at::randn(shape, options);
- FusionExecutor fe;
- fe.compileFusion(&fusion, {t0});
- auto cg_outputs = fe.runFusion({t0});
+ KernelExecutor ke;
+ ke.compile(&fusion, {t0});
+ auto cg_outputs = ke.run({t0});
testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}
@@ -219,9 +219,9 @@ TEST_F(NVFuserTest, FusionComputeWith2_CUDA) {
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({dimx}, options);
- FusionExecutor fe;
- fe.compileFusion(&fusion, {t0});
- auto cg_outputs = fe.runFusion({t0});
+ KernelExecutor ke;
+ ke.compile(&fusion, {t0});
+ auto cg_outputs = ke.run({t0});
auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false);
@@ -261,9 +261,9 @@ TEST_F(NVFuserTest, FusionComputeWith3_CUDA) {
at::Tensor t0 = at::randn({123}, options);
- FusionExecutor fe;
- fe.compileFusion(&fusion, {t0});
- auto cg_outputs = fe.runFusion({t0});
+ KernelExecutor ke;
+ ke.compile(&fusion, {t0});
+ auto cg_outputs = ke.run({t0});
testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}
@@ -307,9 +307,9 @@ TEST_F(NVFuserTest, FusionComputeWith4_CUDA) {
at::Tensor t0 = at::randn({345, 10}, options);
- FusionExecutor fe;
- fe.compileFusion(&fusion, {t0});
- auto cg_outputs = fe.runFusion({t0});
+ KernelExecutor ke;
+ ke.compile(&fusion, {t0});
+ auto cg_outputs = ke.run({t0});
testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}
@@ -346,9 +346,9 @@ TEST_F(NVFuserTest, FusionComputeWith5_CUDA) {
at::Tensor t0 = at::randn({345, 10}, options);
- FusionExecutor fe;
- fe.compileFusion(&fusion, {t0});
- auto cg_outputs = fe.runFusion({t0});
+ KernelExecutor ke;
+
ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -449,9 +449,9 @@ TEST_F(NVFuserTest, FusionComputeWith6_CUDA) { const std::vector input_shape{N, H, W, C}; auto t0 = at::randn(input_shape, options_half); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}, LaunchParams()); + auto cg_outputs = ke.run({t0}); auto t1 = t0.to(at::kFloat); auto t2 = t1.mean({0, 1, 2}); diff --git a/tests/cpp/test_gpu_fused_reduction.cpp b/tests/cpp/test_gpu_fused_reduction.cpp index c29460a4241..6080862aed5 100644 --- a/tests/cpp/test_gpu_fused_reduction.cpp +++ b/tests/cpp/test_gpu_fused_reduction.cpp @@ -115,9 +115,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0).unsqueeze(0) + t0; @@ -164,9 +164,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0).unsqueeze(0) + t0; @@ -212,9 +212,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx, ny}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0, {1}).unsqueeze(-1) + t0; @@ -257,9 +257,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce4_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (sum(t0) + 1).unsqueeze(0) + t0; @@ -319,9 +319,9 @@ TEST_F(NVFuserTest, FusionGridAllreduce5_CUDA) { auto t0 = at::randn({iter, nx}, options); auto t5 = at::randn({bdimy, bdimx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t5}); - auto cg_outputs = fe.runFusion({t0, t5}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t5}); + auto cg_outputs = ke.run({t0, t5}); auto ref = (sum(t0, {1}) + 1).unsqueeze(-1) + t0; @@ -371,14 +371,14 @@ TEST_F(NVFuserTest, FusionGridAllreduce6_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionGridAllreduceWelford1_CUDA) { @@ -417,9 +417,9 @@ TEST_F(NVFuserTest, FusionGridAllreduceWelford1_CUDA) { auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0.mean({0}).unsqueeze(0) + t0) + t0.var({0}, false).unsqueeze(0) * nx; @@ -467,9 +467,9 @@ TEST_F(NVFuserTest, FusionGridAllreduceWelford2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({nx, ny}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (sum(t0, {1}) / ny).unsqueeze(-1) + t0; @@ -586,10 +586,10 @@ TEST_F(NVFuserTest, FusionFusedReductionBatchnorm_CUDA) { GpuLower gpulw(&fusion); validateNoParallelBroadcastExist(gpulw.run()); - FusionExecutor fe; + KernelExecutor ke; LaunchParams launch_params(2, 2, -1, -1, -1, -1); - fe.compileFusion(&fusion, aten_inputs, launch_params); - auto cg_outputs = fe.runFusion(aten_inputs, launch_params); + ke.compile(&fusion, aten_inputs, launch_params); + auto cg_outputs = ke.run(aten_inputs, launch_params); auto t5 = t0.to(at::kFloat); auto t6 = t1.to(at::kFloat); @@ -653,13 +653,13 @@ TEST_F(NVFuserTest, FusionGroupedReduction1_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({1}) * 2; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Grouping reductions with different ops @@ -698,13 +698,13 @@ TEST_F(NVFuserTest, FusionGroupedReduction2_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = (t0 + 1).sum({1}) + std::get<0>((t0 + 2).max(1)); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Grouped reduction with different types @@ -741,13 +741,13 @@ TEST_F(NVFuserTest, FusionGroupedReduction3_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({1}) + t0.to(c10::kDouble).sum({1}).to(c10::kFloat); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Testing validation @@ -829,11 +829,11 @@ TEST_F(NVFuserTest, FusionGroupedReduction6_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); - testValidate(fe.kernel(), outputs, {t0}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionGroupedReduction7_CUDA) { @@ -892,13 +892,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor1_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + 
KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({0}) * 2; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Rfactoring grouped reductions @@ -937,13 +937,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionRfactor2_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = t0.sum({0}) * 2; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Group reductions of tensors that have computeAt positions set @@ -983,13 +983,13 @@ TEST_F(NVFuserTest, FusionGroupedReductionAfterComputeAt_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto ref = (t0 + 1).sum({1}) * 2; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionGroupAllreduce1_CUDA) { @@ -1023,14 +1023,14 @@ TEST_F(NVFuserTest, FusionGroupAllreduce1_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t3 = t0.sum({0}).unsqueeze(-1); auto ref = t0 + t3 + t3; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Grid reductions of different types @@ -1076,15 +1076,15 @@ TEST_F(NVFuserTest, FusionGroupAllreduce2_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t2 = t0.sum({1}).unsqueeze(-1); auto t6 = t0.to(c10::kDouble).sum({1}).unsqueeze(-1).to(c10::kFloat); auto ref = t0 + t2 + t6; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Grouping 3 grid allreduces @@ -1124,15 +1124,15 @@ TEST_F(NVFuserTest, FusionGroupAllreduce3_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t3 = t0 / t0.sum({0}).unsqueeze(0); auto t6 = t0 / std::get<0>(t0.max(0)).unsqueeze(0); auto t9 = t0 - std::get<0>(t0.min(0)).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {t3, t6, t9}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {t3, t6, t9}, __LINE__, __FILE__); } // Grouping 8 grid allreduces @@ -1177,9 +1177,9 @@ TEST_F(NVFuserTest, FusionGroupAllreduce4_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); at::Tensor ref = t0; for (int i = 0; i < num_reductions; ++i) { @@ -1189,7 +1189,7 @@ TEST_F(NVFuserTest, FusionGroupAllreduce4_CUDA) { ref = add(ref, bc); } - testValidate(fe.kernel(), 
outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Variation of FusionGroupAllreduce5_CUDA but with different @@ -1265,9 +1265,9 @@ TEST_F(NVFuserTest, FusionGroupAllreduce5_CUDA) { std::vector indices({at::indexing::Slice(0, 10)}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t3 = t0 / t0.sum({0}).unsqueeze(0).to(at::kComplexDouble); auto t7 = t4 / t4.sum({0}).unsqueeze(0).to(at::kComplexDouble); @@ -1275,7 +1275,7 @@ TEST_F(NVFuserTest, FusionGroupAllreduce5_CUDA) { auto t15 = t12 / t12.sum({0}).unsqueeze(0).to(at::kComplexDouble); auto t19 = t16 / t16.sum({0}).unsqueeze(0).to(at::kComplexDouble); auto ref = t3 + t7 + t11 + t15 + t19; - testValidate(fe.kernel(), outputs, aten_inputs, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, aten_inputs, {ref}, __LINE__, __FILE__); } // Persistent batchnorm backward with grouped allreduce @@ -1428,14 +1428,14 @@ TEST_F(NVFuserTest, FusionPersistentBNBackwardAllreduce_CUDA) { GpuLower gpulw(&fusion); validateNoParallelBroadcastExist(gpulw.run()); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); if (bidx * bidy > deviceSMCount()) { GTEST_SKIP() << "Not enough SMs to run this test"; } - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); std::vector at_reduction_axes; std::copy( @@ -1483,7 +1483,7 @@ TEST_F(NVFuserTest, FusionPersistentBNBackwardAllreduce_CUDA) { } testValidate( - fe.kernel(), outputs, aten_inputs, {at_grad_input}, __LINE__, __FILE__); + ke.kernel(), outputs, aten_inputs, {at_grad_input}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) { @@ -1534,14 +1534,14 @@ TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) { auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = (t0_double + 1).sum({0}) + (t0_double + 2).sum({0}); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Channels-last batch norm with vectorization. 
Relies on re-entrant @@ -1649,9 +1649,9 @@ TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) { auto t2 = at::randn({shape.back()}, options_float); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t0_double = t0.to(at::kDouble); auto t1_double = t1.to(at::kDouble); @@ -1664,7 +1664,7 @@ TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) { (t1_double - t2_double.unsqueeze(0).unsqueeze(0).unsqueeze(0)); auto t9 = t8.sum(at_reduction_axes); - testValidate(fe.kernel(), outputs, aten_inputs, {t5, t9}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, aten_inputs, {t5, t9}, __LINE__, __FILE__); } // Test the grouped grid allreduce with BN-like outer reductions @@ -1780,9 +1780,9 @@ TEST_F( auto t2 = at::randn({shape.back()}, options_float); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto t0_double = t0.to(at::kDouble); auto t1_double = t1.to(at::kDouble); @@ -1801,7 +1801,7 @@ TEST_F( auto t13 = t1_double + t12; testValidate( - fe.kernel(), outputs, aten_inputs, {t11, t13}, __LINE__, __FILE__); + ke.kernel(), outputs, aten_inputs, {t11, t13}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce1_CUDA) { @@ -1868,14 +1868,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Test grouping of two domains @@ -1946,14 +1946,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Group both expressions and iterations @@ -2030,16 +2030,16 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce3_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto t4 = t0_double + 1 + (t0_double + 1).sum({0}).unsqueeze(0); auto t8 = t0_double + 2 + (t0_double + 2).sum({0}).unsqueeze(0); auto ref = t4 + t8; - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + 
testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // ParallelType::Group with computeAt @@ -2122,14 +2122,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduce4_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.sum({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford1_CUDA) { @@ -2183,14 +2183,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford1_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.mean({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Test grouping of two domains @@ -2248,14 +2248,14 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelford2_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto outputs = ke.run({t0}); auto t0_double = t0.to(at::kDouble); auto ref = t0_double + t0_double.mean({0}).unsqueeze(0); - testValidate(fe.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), outputs, {t0}, {ref}, __LINE__, __FILE__); } // Follows the pattern of persistent outer grid welford in batchnorm @@ -2385,8 +2385,8 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelfordShmoo_CUDA) { params.N, params.H, params.W, params.C}; auto t0 = at::randn(input_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); // Skip the rest of this test size if the required number of SMs // exceeds the available SM count @@ -2397,7 +2397,7 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelfordShmoo_CUDA) { return; } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto t1 = t0.to(at::kDouble); auto t2 = t1.mean({0, 1, 2}).unsqueeze(0).unsqueeze(0).unsqueeze(0); @@ -2541,9 +2541,9 @@ TEST_F(NVFuserTest, FusionCrossEntropyGatherPattern_CUDA) { at::randint(0, num_classes, {batch_size}, options.dtype(at::kLong)); std::vector inputs = {at_log_probs, at_labels}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref = at::gather(at_log_probs, 1, at_labels.unsqueeze(1)).squeeze(); diff --git a/tests/cpp/test_gpu_indexing_ops.cpp b/tests/cpp/test_gpu_indexing_ops.cpp index 456fe0c2e46..ed5c4a5ce86 100644 --- a/tests/cpp/test_gpu_indexing_ops.cpp +++ b/tests/cpp/test_gpu_indexing_ops.cpp @@ -396,9 +396,9 @@ TEST_F(NVFuserTest, 
FusionIndexSelect_Sum_CUDA) { std::vector aten_inputs = {input1, input0, input_idx}; auto heuristic_params = SchedulerEntry::scheduleWith( &fusion, SchedulerType::Reduction, aten_inputs); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs, heuristic_params->lparams); - fe.runFusion(aten_inputs, {cg_output}, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs, heuristic_params->lparams); + ke.run(aten_inputs, {cg_output}, heuristic_params->lparams); auto tv0_ref = at::index_select(input0, 0, input_idx); at::Tensor tv2_ref = tv0_ref * input1; diff --git a/tests/cpp/test_gpu_outer_reduction.cpp b/tests/cpp/test_gpu_outer_reduction.cpp index afaf89b7aeb..7934689844c 100644 --- a/tests/cpp/test_gpu_outer_reduction.cpp +++ b/tests/cpp/test_gpu_outer_reduction.cpp @@ -115,11 +115,11 @@ TEST_F(OuterReductionTest, GroupedGridWelfordOuterOpt) { auto t0 = at::randn(input_shape, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); NVF_CHECK( - fe.kernel()->summary().has_outer_grouped_grid_welford == + ke.kernel()->summary().has_outer_grouped_grid_welford == params.should_use_opt, (params.should_use_opt ? "Failed to use the optimized implementation" : "Should not use the optimized implementation"), @@ -132,7 +132,7 @@ TEST_F(OuterReductionTest, GroupedGridWelfordOuterOpt) { ", ", params.bidx); - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t1 = t0; auto t2 = params.dtype == DataType::Half ? t1.to(at::kFloat) : t1; @@ -638,8 +638,8 @@ void grid_persistent_reduction_outer_norm_like( const std::vector input_shape{N, HW, HW, C}; auto t0 = at::randn(input_shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -648,12 +648,12 @@ void grid_persistent_reduction_outer_norm_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion({t0}); + cg_outputs = ke.run({t0}); } } @@ -737,8 +737,8 @@ void grid_persistent_welford_outer_norm_like( const std::vector input_shape{N, HW, HW, C}; auto t0 = at::randn(input_shape, options_half); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -747,12 +747,12 @@ void grid_persistent_welford_outer_norm_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion({t0}); + cg_outputs = ke.run({t0}); } } @@ -898,8 +898,8 @@ void grid_persistent_batchnorm_manual( std::vector aten_inputs( {at_input_nvfuser, at_weight, at_bias, at_running_mean, at_running_var}); - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), aten_inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -908,7 +908,7 @@ void grid_persistent_batchnorm_manual( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); 
cg_outputs.at(2) = cg_outputs.at(2).permute({0, 3, 1, 2}); auto at_output = at::batch_norm( @@ -923,7 +923,7 @@ void grid_persistent_batchnorm_manual( true); testValidate( - fe.kernel(), + ke.kernel(), {cg_outputs.at(2)}, aten_inputs, {at_output}, @@ -934,7 +934,7 @@ void grid_persistent_batchnorm_manual( if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); } } } @@ -1037,8 +1037,8 @@ void grid_persistent_reduction_outer_norm_bwd_like( auto t1 = at::randn(input_shape, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -1047,12 +1047,12 @@ void grid_persistent_reduction_outer_norm_bwd_like( << params.bidx * bidy << ", available: " << deviceSMCount(); } - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); } } @@ -1224,8 +1224,8 @@ void grid_persistent_batchnorm_bwd_manual( std::vector cg_outputs; - FusionExecutor fe; - fe.compileFusion(fusion_ptr.get(), aten_inputs); + KernelExecutor ke; + ke.compile(fusion_ptr.get(), aten_inputs); auto bidy = ceilDiv(ceilDiv(N * HW * HW, params.tidy), params.pb); @@ -1234,7 +1234,7 @@ void grid_persistent_batchnorm_bwd_manual( << params.bidx * bidy << ", available: " << deviceSMCount(); } - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); // Permute grad_input output cg_outputs.at(0) = cg_outputs.at(0).permute({0, 3, 1, 2}); @@ -1251,7 +1251,7 @@ void grid_persistent_batchnorm_bwd_manual( {true, true, true}); testValidate( - fe.kernel(), + ke.kernel(), cg_outputs, aten_inputs, {std::get<0>(at_output), std::get<1>(at_output), std::get<2>(at_output)}, @@ -1262,7 +1262,7 @@ void grid_persistent_batchnorm_bwd_manual( if (benchmark_mode) { for (int i = 0; i < 10; ++i) { clearL2Cache(); - cg_outputs = fe.runFusion(aten_inputs); + cg_outputs = ke.run(aten_inputs); } } } @@ -2181,22 +2181,20 @@ TEST_F(OuterReductionTest, IterGroupedBlockReduction) { rparams->unroll_factor_iter_dom = vect_factor; scheduler->schedule(&fusion, rparams); - FusionExecutor fusion_executor; - fusion_executor.compileFusion( - &fusion, aten_inputs, heuristic_params->lparams); - auto cg_outputs = - fusion_executor.runFusion(aten_inputs, heuristic_params->lparams); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs, heuristic_params->lparams); + auto cg_outputs = ke.run(aten_inputs, heuristic_params->lparams); // lowering & check iteration grouped reductions NVF_CHECK( - fusion_executor.kernel()->summary().has_iter_grouped_reductions, + ke.kernel()->summary().has_iter_grouped_reductions, "There must be iter domain grouped reductions."); NVF_CHECK( - fusion_executor.kernel()->summary().num_grouped_iterations == vect_factor, + ke.kernel()->summary().num_grouped_iterations == vect_factor, "Expected ", vect_factor, " grouped iterations, found ", - fusion_executor.kernel()->summary().num_grouped_iterations); + ke.kernel()->summary().num_grouped_iterations); testValidate( &fusion, @@ -2292,9 +2290,9 @@ void shmooTestsOfIterGroupedBlockOrGridReduction( auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = 
fe.runFusion(aten_inputs, lparams); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); testValidate( &fusion, @@ -2543,15 +2541,15 @@ TEST_F(OuterReductionTest, IterGroupedMultipleReductions) { << "Expect 2 Iteration domain grouped grid reductions, got: " << num_iter_grouped_reductions; - FusionExecutor fe; + KernelExecutor ke; std::vector shape({redu_dim, iter_dim}); auto options = at::TensorOptions().device(at::kCUDA, 0); auto t0 = at::randn(shape, options); auto t1 = at::randn(shape, options); std::vector aten_inputs({t0, t1}); - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + ke.compile(&fusion, aten_inputs, lparams); + auto cg_outputs = ke.run(aten_inputs, lparams); testValidate( &fusion, @@ -2595,10 +2593,10 @@ TEST_F(NVFuserTest, SmallOuterBlockReductionIssue2766) { auto t0 = at::randn({shape[0] * shape[1], shape[2]}, options); std::vector inputs({t0}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), outputs, inputs, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_gpu_transpose.cpp b/tests/cpp/test_gpu_transpose.cpp index 11c326878fc..13b01bfb8c9 100644 --- a/tests/cpp/test_gpu_transpose.cpp +++ b/tests/cpp/test_gpu_transpose.cpp @@ -547,9 +547,9 @@ TEST_F(TransposeTest, FusionManualScheduleTransposeComplexDAG1) { at::Tensor input1 = at::randn({1024, 512, 256}, options); at::Tensor input2 = at::randn({512, 256, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1, input2}); - auto outputs = fe.runFusion({input0, input1, input2}); + KernelExecutor ke; + ke.compile(&fusion, {input0, input1, input2}); + auto outputs = ke.run({input0, input1, input2}); testValidate(&fusion, outputs, {input0, input1, input2}, __LINE__, __FILE__); } @@ -987,9 +987,9 @@ TEST_F(TransposeTest, FusionTransposeBankConflict9) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({32, 32, 2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({input}); testValidate(&fusion, outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_gpu_view.cpp b/tests/cpp/test_gpu_view.cpp index 725469b2647..c3319a7ef39 100644 --- a/tests/cpp/test_gpu_view.cpp +++ b/tests/cpp/test_gpu_view.cpp @@ -134,9 +134,9 @@ TEST_F(GpuViewTest, FusionViewAsRealOutput) { at::Tensor at_y = at::randn(output_shape, out_options); std::vector aten_inputs = {at_x, at_bias, at_y}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -279,8 +279,8 @@ void reductionViewAddFusion( at::Tensor at_bias = at::randn(bias_shape, options); std::vector aten_inputs = {at_x, at_bias}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = 
executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -445,8 +445,8 @@ void persistentViewAddFusion( at::Tensor at_bias = at::randn(bias_shape, options); std::vector aten_inputs = {at_x, at_bias}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -637,9 +637,9 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain) { auto t0 = at::randn({2, 3}, options); auto t1 = at::randn({1, 6}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -668,8 +668,8 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain2) { at::Tensor at_bias = at::randn(output_shape, options); std::vector aten_inputs = {at_x, at_bias}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -704,8 +704,8 @@ TEST_F(GpuViewTest, FusionReshapeConcreteDomain3) { at::Tensor at_z = at::randn(other_shape, options); std::vector aten_inputs = {at_x, at_y, at_z}; - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -850,9 +850,9 @@ TEST_F(GpuViewTest, FusionFlattenAfterUnsqueezeOutput) { x_add_bias->computeAt(x_reshape, 1); x_reshape->axis(0)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -914,15 +914,15 @@ TEST_F(GpuViewTest, FusionExpandRepro) { at::Tensor at_y = at::randn(input_shape2, options); std::vector aten_inputs = {at_x, at_y}; - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); LaunchParams l_params; - auto outputs = fe.runFusion(aten_inputs, {}, l_params, {}); + auto outputs = ke.run(aten_inputs, {}, l_params, {}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); // second run to verify cached output allocation - outputs = fe.runFusion(aten_inputs, {}, l_params, {}); + outputs = ke.run(aten_inputs, {}, l_params, {}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1349,9 +1349,9 @@ TEST_F(GpuViewTest, FusionPwiseViewSchedule) { at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t3 = at::randn({x, y, z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t3}); - auto cg_outputs = fe.runFusion({t0, t3}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t3}); + auto cg_outputs = ke.run({t0, t3}); testValidate(&fusion, cg_outputs, {t0, t3}, __LINE__, __FILE__); 
} @@ -1415,9 +1415,9 @@ TEST_F(GpuViewTest, FusionSumViewSchedule) { auto t5 = t4.sum({1}); auto t6 = t0 + t3; - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t3}); - auto cg_outputs = fe.runFusion({t0, t3}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t3}); + auto cg_outputs = ke.run({t0, t3}); testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t5, t6}, __LINE__, __FILE__); } @@ -1944,9 +1944,9 @@ TEST_F(GpuViewTest, FusionReshapeMapping) { at::Tensor t0 = at::randn({w, x, y * z}, options); at::Tensor t3 = at::randn({w, x * y, z}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t3}); - auto cg_outputs = fe.runFusion({t0, t3}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t3}); + auto cg_outputs = ke.run({t0, t3}); testValidate(&fusion, cg_outputs, {t0, t3}, __LINE__, __FILE__); } @@ -2318,9 +2318,9 @@ TEST_F(GpuViewTest, ExpandedBroadcast) { at::Tensor in_tensor = at::randn({4, 5}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - FusionExecutor fe; - fe.compileFusion(&fusion, {in_tensor}); - at::Tensor actual_out_tensor = fe.runFusion({in_tensor})[0]; + KernelExecutor ke; + ke.compile(&fusion, {in_tensor}); + at::Tensor actual_out_tensor = ke.run({in_tensor})[0]; testValidate(&fusion, {actual_out_tensor}, {in_tensor}, __LINE__, __FILE__); } @@ -2697,9 +2697,9 @@ TEST_F(GpuViewTest, FusionMismatchingReshape) { // TODO: use larger tensor size once we are able to successfully parallelize // this fusion. at::Tensor t0 = at::randn({2, 3, 5}).to(options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_host_irs.cpp b/tests/cpp/test_host_irs.cpp index a651fb5778d..bee0c2979dc 100644 --- a/tests/cpp/test_host_irs.cpp +++ b/tests/cpp/test_host_irs.cpp @@ -346,7 +346,7 @@ TEST_P(HostIrTest, ThreeFusions) { // [Step 8)] Execute the Host program HostIrExecutorParams params; - // we test two different modes of the HostIrExecutor: using FusionExecutor or + // we test two different modes of the HostIrExecutor: using KernelExecutor or // FusionExecutorCache auto [use_fusion_executor_cache] = GetParam(); params.use_fusion_executor_cache = use_fusion_executor_cache; diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 23c48dfc0b7..1862747ec47 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -1773,9 +1773,9 @@ TEST_F(IndexingTest, SmemAllocationDomainForTranspose) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({256, 256}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input0}); - auto outputs = fe.runFusion({input0}); + KernelExecutor ke; + ke.compile(&fusion, {input0}); + auto outputs = ke.run({input0}); testValidate(&fusion, outputs, {input0}, __LINE__, __FILE__); } @@ -2151,6 +2151,11 @@ TEST_F(IndexingTest, DoubleBuffering6) { return nullptr; } + // This loop is double buffered. Since the loop originally has + // just a trip count of 2, the double-buffered main loop has a + // trip count of 1. 
Thus, this loop is always trivial + loop_indices.at(1) = tv->fusion()->zeroVal(); + switch (tv->name()) { case 1: { if (!as_consumer) { @@ -3040,9 +3045,9 @@ TEST_F(PredicateIndexingTest, DoubleBuffering1) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3139,9 +3144,9 @@ TEST_F(PredicateIndexingTest, CircularBuffering1) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3306,9 +3311,9 @@ TEST_F(PredicateIndexingTest, UnrolledCircularBuffering) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3387,9 +3392,9 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering1) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3476,9 +3481,9 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering2) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3582,9 +3587,9 @@ TEST_P(PredicateIndexingTest, UnswitchedCircularBuffering3) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -3661,9 +3666,9 @@ TEST_F(PredicateIndexingTest, UnswitchedCircularBuffering4) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({16}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -3754,9 +3759,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplit1) { at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); 
testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -3845,9 +3850,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithUnswitch) { at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -3940,9 +3945,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithCircularBuffering) { at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4051,9 +4056,9 @@ TEST_F( at::Tensor t0 = at::randn({999}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4136,9 +4141,9 @@ TEST_P(PredicateIndexingTest, UnswitchPredicateIssueRepro681) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); auto ref = t0.to(at::kDouble).sum(); @@ -4296,9 +4301,9 @@ TEST_F(PredicateIndexingTest, NonDivisibleSplitWithUnswitchAndBroadcast) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4419,9 +4424,9 @@ TEST_F(PredicateIndexingTest, UnswitchConsolidationDifferentThreading) { EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -4834,9 +4839,9 @@ TEST_F(ContigIndexingTest, ConcretizedBroadcastMerge) { auto t1 = at::randn({5, 6, 7}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -5063,9 +5068,9 @@ TEST_F(ContigPredicateIndexingTest, NonDivisibleSplit1) { at::Tensor t0 = at::randn({10, 20}, options); std::vector aten_inputs = {t0}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } diff --git 
a/tests/cpp/test_indexing_advanced.cpp b/tests/cpp/test_indexing_advanced.cpp index 1787109a272..4b0674ca015 100644 --- a/tests/cpp/test_indexing_advanced.cpp +++ b/tests/cpp/test_indexing_advanced.cpp @@ -72,10 +72,10 @@ TEST_P(AdvancedIndexingTest, InlineBroadcast) { at::Tensor t0 = at::randn({123}, options); at::Tensor t1 = at::randn({3, 123}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); - auto outputs = fe.runFusion({t0, t1}); + auto outputs = ke.run({t0, t1}); testValidate(&fusion, outputs, {t0, t1}, __LINE__, __FILE__); } @@ -117,15 +117,15 @@ TEST_P(AdvancedIndexingTest, 1) { tv2->axis(1)->parallelize(ParallelType::Unroll); tv2->axis(2)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t1 = at::randn({w, x, y, z}, options); std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -168,15 +168,15 @@ TEST_P(AdvancedIndexingTest, 2) { tv2->axis(1)->parallelize(ParallelType::Unroll); tv2->axis(2)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; at::Tensor t0 = at::randn({x, y, z}, options); at::Tensor t1 = at::randn({w, x, y, z}, options); std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -229,9 +229,9 @@ TEST_P(AdvancedIndexingTest, 4) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -263,9 +263,9 @@ TEST_P(AdvancedIndexingTest, 5) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -345,9 +345,9 @@ TEST_P(AdvancedIndexingTest, 7) { auto at_t0 = at::randn({numel_x}, options); auto at_t1 = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t1}); - auto cg_outputs = fe.runFusion({at_t0, at_t1}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0, at_t1}); + auto cg_outputs = ke.run({at_t0, at_t1}); auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) .to(at::kDouble) @@ -391,9 +391,9 @@ TEST_P(AdvancedIndexingTest, 8) { auto at_t0 = at::randn({numel_x}, options); auto at_t1 = at::randn({numel_x, numel_y}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {at_t0, at_t1}); - auto cg_outputs = fe.runFusion({at_t0, at_t1}); + KernelExecutor ke; + ke.compile(&fusion, {at_t0, at_t1}); + auto cg_outputs = ke.run({at_t0, at_t1}); auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) .to(at::kDouble) @@ -484,9 +484,9 @@ TEST_P(AdvancedIndexingTest, 10) { at::Tensor input2 = at::rand_like(input1); at::Tensor output = at::empty_like(input1); - 
FusionExecutor fe; - fe.compileFusion(&fusion, {input1, input2}); - fe.runFusion({input1, input2}, {output}); + KernelExecutor ke; + ke.compile(&fusion, {input1, input2}); + ke.run({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; at::Tensor output_ref = input1 + tv2_ref; @@ -531,15 +531,15 @@ TEST_P(AdvancedIndexingTest, 11) { tv3->axis(-1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; at::Tensor t0 = at::randn({w, x, y, z}, options); at::Tensor t1 = at::randn({x}, options); std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -574,9 +574,9 @@ TEST_P(AdvancedIndexingTest, 12) { std::vector aten_outputs = {t2, t4}; - FusionExecutor fe; - fe.compileFusion(&fusion, {aten_input}); - auto cg_outputs = fe.runFusion({aten_input}); + KernelExecutor ke; + ke.compile(&fusion, {aten_input}); + auto cg_outputs = ke.run({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); @@ -623,9 +623,9 @@ TEST_P(AdvancedIndexingTest, 13) { std::vector aten_inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -665,9 +665,9 @@ TEST_P(AdvancedIndexingTest, 14) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -700,9 +700,9 @@ TEST_P(AdvancedIndexingTest, 15) { at::Tensor t3 = at::randn({bx, by, bz}, options); std::vector aten_inputs = {t0, t3}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -732,9 +732,9 @@ TEST_P(AdvancedIndexingTest, 16) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -767,9 +767,9 @@ TEST_P(AdvancedIndexingTest, 17) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -804,13 +804,13 @@ TEST_P(AdvancedIndexingTest, 18) { at::Tensor t1 = at::randn({5, 3}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto cg_outputs = ke.run(inputs); auto ref = (t0.unsqueeze(-1) + t1).sum(); - testValidate(fe.kernel(), cg_outputs, inputs, {ref}, __LINE__, __FILE__); + testValidate(ke.kernel(), 
cg_outputs, inputs, {ref}, __LINE__, __FILE__); } TEST_P(AdvancedIndexingTest, 19) { @@ -848,9 +848,9 @@ TEST_P(AdvancedIndexingTest, 19) { at::Tensor t1 = at::randn({5, 11}, options); std::vector inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -913,9 +913,9 @@ TEST_F(AdvancedIndexingIdModelTest, 20) { at::Tensor t2 = at::randn({7, 13}, options); std::vector inputs = {t0, t1, t2}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); #endif @@ -978,9 +978,9 @@ TEST_F(AdvancedIndexingIdModelTest, 21) { auto t6 = at::randn({3, 5, 7}, options); std::vector inputs = {t0, t3, t6}; - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); #endif @@ -1022,9 +1022,9 @@ TEST_F(AdvancedIndexingIdModelTest, MultiPromotion1) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1119,9 +1119,9 @@ TEST_F(AdvancedIndexingIdModelTest, IndexSplitMerge) { std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); diff --git a/tests/cpp/test_inlining.cpp b/tests/cpp/test_inlining.cpp index 320241e6112..569b541a838 100644 --- a/tests/cpp/test_inlining.cpp +++ b/tests/cpp/test_inlining.cpp @@ -48,9 +48,9 @@ TEST_F(InliningTest, InliningMismatchedDims1) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -80,9 +80,9 @@ TEST_F(InliningTest, InliningMismatchedDims2) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -113,9 +113,9 @@ TEST_F(InliningTest, InliningMismatchedDims4) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } @@ -150,9 +150,9 @@ 
TEST_F(InliningTest, InliningBroadcast) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input = at::randn({2, 3, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_loop_domain_scheduling.cpp b/tests/cpp/test_loop_domain_scheduling.cpp index 52884529727..710be9ce08a 100644 --- a/tests/cpp/test_loop_domain_scheduling.cpp +++ b/tests/cpp/test_loop_domain_scheduling.cpp @@ -86,9 +86,9 @@ TEST_F(LoopDomainSchedulingTest, ReshapeSplitThenMerge) { auto t0 = at::randn({10}, options); std::vector inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, inputs); - auto outputs = fe.runFusion(inputs); + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } @@ -147,9 +147,9 @@ TEST_F(LoopDomainSchedulingTest, Slice) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -306,9 +306,9 @@ TEST_F(LoopDomainSchedulingTest, ManyReshape) { auto t0 = at::randn({12}, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0 * 2; EXPECT_TRUE(ref.equal(cg_outputs[0])); diff --git a/tests/cpp/test_loop_rotation.cpp b/tests/cpp/test_loop_rotation.cpp index 39314552945..db5f3e20848 100644 --- a/tests/cpp/test_loop_rotation.cpp +++ b/tests/cpp/test_loop_rotation.cpp @@ -76,9 +76,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {1, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -169,9 +169,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {1, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -278,9 +278,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {1, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -389,9 +389,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {5, 99}) { auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -526,9 +526,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {5, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } @@ -662,9 +662,9 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor for (auto n : {5, 99}) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } } diff --git a/tests/cpp/test_matmul.cpp b/tests/cpp/test_matmul.cpp index 1c0fca0ac89..12ed11bd554 100644 --- a/tests/cpp/test_matmul.cpp +++ b/tests/cpp/test_matmul.cpp @@ -124,19 +124,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmul) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -185,19 +185,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBroadcastBatch) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout) @@ -243,19 +243,19 @@ TEST_P(MatmulTestWithLayout, AmperePrologueFusionBroadcast) { auto inputs = matmulAtInput2D(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - 
PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -304,19 +304,19 @@ TEST_P(MatmulTestWithLayout, AmpereProloguePointwise) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.sin().to(at::kFloat), inputs.second.sin().to(at::kFloat), @@ -365,19 +365,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBFloat16) { auto inputs = matmulAtInput3DTuring(M, N, K, layout, at::kBFloat16); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -428,19 +428,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulPipelineGmem) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -512,25 +512,25 @@ TEST_P(MatmulTestWithLayout, AmpereSwizzle) { FusionProfiler::createSegments(1); } - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + 
PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.01, 0.01)); - int gdimx = fe.lastLaunchParams().gdimx(); - int gdimy = fe.lastLaunchParams().gdimy(); + int gdimx = ke.lastLaunchParams().gdimx(); + int gdimy = ke.lastLaunchParams().gdimy(); int expected_gdim_unswizzled = (dim + 128 - 1) / 128; int expected_gdimx = expected_gdim_unswizzled * swizzle; @@ -640,19 +640,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulRegCircularBuffer) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -932,16 +932,14 @@ TEST_F(MatmulTest, MatmulMatmulAmpere) { .matmul(t1.t().to(at::kFloat)) .matmul(t2.t().to(at::kFloat)); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // relaxed check for now, err accumulation is significant. 
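The hunks above and below repeat one mechanical migration: FusionExecutor becomes KernelExecutor, compileFusion() becomes compile(), and runFusion() becomes run(). A minimal sketch of the post-rename test pattern follows, assuming the fixtures used throughout this diff (fusion, t0, t1, and matmul_cparams come from the surrounding test body); it is a reading aid, not additional changed code.

```cpp
// Post-rename executor usage; a sketch under the assumptions above.
KernelExecutor ke;                  // was: FusionExecutor fe;
ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams);
                                    // was: fe.compileFusion(...)
auto cg_outputs = ke.run({t0, t1}); // was: fe.runFusion(...)
// Kernel introspection is unchanged apart from the receiver:
ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
ASSERT_FALSE(
    PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel()));
```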
NVF_CHECK(cg_outputs[0].allclose(tref, 0.1, 0.1)); } @@ -1312,16 +1310,14 @@ TEST_F(MatmulTest, MatmulSoftmaxMatmulAmpere) { auto t1 = at::randn({N1, K1}, options); auto t2 = at::randn({N2, K2}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1, t2}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1, t2}); + auto cg_outputs = ke.run({t0, t1, t2}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto g1 = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat)); auto sg1 = at::_softmax(g1, -1, false); auto gsg1 = sg1.matmul(t2.t().to(at::kFloat)); @@ -1367,13 +1363,13 @@ TEST_P(MatmulTestWithLayout, TuringMatmul) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, {inputs.first, inputs.second})); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 7, 5, ke.compile(&fusion, {inputs.first, inputs.second})); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -1511,15 +1507,13 @@ TEST_F(MatmulTest, AmpereMatmulTNCpAsync) { auto t0 = at::randn({M, K}, options); auto t1 = at::randn({N, K}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -1679,16 +1673,14 @@ TEST_F(MatmulTest, AmpereStridedBatchedMatmulTN) { auto t0 = at::randn({B0, M, B1, K}, options); auto t1 = at::randn({B0, N, B1, K}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // ref implementation: auto ref_t0 = t0.permute({0, 2, 1, 3}) .contiguous() @@ -1852,16 +1844,14 @@ TEST_F(MatmulTest, AmpereViewMatmulTN) { auto t0 = at::randn({M, Ko, Ki}, options); auto t1 = at::randn({N, K}, options); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + 8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - auto 
cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = at::native::view(t0, {M, K}).to(at::kFloat).matmul(t1.t().to(at::kFloat)); @@ -2040,11 +2030,11 @@ TEST_F(MatmulTest, AmpereMatmulTNSwizzled) { auto t0 = at::randn({M, K}, options); auto t1 = at::randn({N, K}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0, t1}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2091,19 +2081,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoad) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2147,19 +2137,19 @@ TEST_P(MatmulTestWithLayout, TuringMatmulLargeLoad) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 7, 5, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2219,19 +2209,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck4warp) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.run({inputs.first, inputs.second}); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); 
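Many of these tests additionally wrap the compile call in NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK, which is hard to read once reflowed. The guarded form, as used in this diff, is sketched below; the macro's first two arguments are taken to be the minimum compute capability, and its exact skip-versus-fail behavior is an assumption, not shown in this diff.

```cpp
// Arch-guarded compile: only attempt compilation when the device meets the
// stated minimum compute capability (here 8.0, i.e. Ampere).
KernelExecutor ke;
NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
    8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams));
auto cg_outputs = ke.run({t0, t1});
```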
NVF_CHECK( @@ -2300,19 +2290,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck8warp) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2371,19 +2361,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulTileCheck6warp) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -2431,19 +2421,19 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulLargeLoadLargeK) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.001, 0.001)); @@ -2489,15 +2479,13 @@ TEST_P(MatmulTestWithLayout, AmpereSplitKLikeStridedBatchedMatmul) { auto t0 = matmulAtInput2D(layout, TensorMatmulPos::A, at::kHalf, M, N, K, B); auto t1 = matmulAtInput2D(layout, TensorMatmulPos::B, at::kHalf, M, N, K, B); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, - 0, - fe.compileFusion(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, {t0, t1}, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({t0, t1}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({t0, t1}); auto tref = splitkLikeAtMatmul(t0.to(at::kFloat), t1.to(at::kFloat), layout); 
NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2578,23 +2566,23 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogue) { at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -2612,7 +2600,7 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogue) { // - !use_smem_epilogue : A + B (this test is skipped in this case) // - use_smem_epilogue && !promote_prologue_smem_reuse : A + B + C // - use_smem_epilogue && promote_prologue_smem_reuse : max(A + B, C) - auto smem_allocs = fe.kernel()->summary().dynamic_smem_allocations; + auto smem_allocs = ke.kernel()->summary().dynamic_smem_allocations; NVF_CHECK(smem_allocs.size() == 3); if (mparams.promote_prologue_smem_reuse) { // Check prologue shared memory re-use @@ -2712,29 +2700,29 @@ TEST_F(MatmulTest, AmpereMatmulSmemEpiloguePromotionRequiredA100) { SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) ->schedule(&fusion, &mparams); - // FusionExecutor::compileFusion would fail otherwise. + // KernelExecutor::compile would fail otherwise. 
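The shared-memory case analysis spelled out in the AmpereMatmulSmemEpilogue comment above reduces to a small formula. The helper below is a hypothetical restatement (the name and signature are invented for illustration and are not part of the change); A and B are the prologue operand buffers, C the epilogue buffer.

```cpp
#include <algorithm>
#include <cstddef>

// Hypothetical restatement of the dynamic smem budget described above.
// A, B: prologue operand buffer sizes; C: epilogue buffer size.
size_t dynamicSmemBytes(size_t A, size_t B, size_t C,
                        bool use_smem_epilogue,
                        bool promote_prologue_smem_reuse) {
  if (!use_smem_epilogue) {
    return A + B;            // no epilogue buffer in shared memory
  }
  return promote_prologue_smem_reuse
      ? std::max(A + B, C)   // epilogue buffer reuses prologue space
      : A + B + C;           // all three buffers live simultaneously
}
```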
SKIP_IF_INSUFFICIENT_SMEM(&mparams, data_types); at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -2818,23 +2806,23 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueCast) { at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); tref = tref.to(at::kHalf); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -2914,24 +2902,24 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulSmemEpilogueRelu) { at::manual_seed(0); auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto t2 = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); auto tref = at::relu(t2).to(at::kFloat); // check bank conflicts - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); // (0.001, 0.001) passed on local A100 but failed on CI A100 NVF_CHECK( cg_outputs[0].allclose(tref, 0.01, 0.01), @@ -3003,13 +2991,13 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitK_CUDA) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, {inputs.first, inputs.second})); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + 7, 5, ke.compile(&fusion, {inputs.first, inputs.second})); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.run({inputs.first, inputs.second}); ASSERT_FALSE( 
- PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); @@ -3068,13 +3056,12 @@ TEST_P(MatmulTestWithLayout, FusionAmpereMatmulSplitKBias_CUDA) { at::Tensor aten_bias = at::randn({M}, aten_a.options()); std::vector inputs = {aten_a, aten_b, aten_bias}; - FusionExecutor fe; - NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, inputs)); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - auto cg_outputs = fe.runFusion(inputs); + KernelExecutor ke; + NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(7, 5, ke.compile(&fusion, inputs)); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.run(inputs); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); auto tref = atBiasEpilogue( atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout), aten_bias); @@ -3131,13 +3118,12 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitK) { std::vector inputs = {aten_a, aten_b}; - FusionExecutor fe; - NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, inputs)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + KernelExecutor ke; + NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(7, 5, ke.compile(&fusion, inputs)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto tref = atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout); @@ -3198,13 +3184,12 @@ TEST_P(MatmulTestWithLayout, AmpereMatmulBatchSplitKBias) { std::vector inputs = {aten_a, aten_b, aten_bias}; - FusionExecutor fe; - NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 7, 5, fe.compileFusion(&fusion, inputs)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + KernelExecutor ke; + NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(7, 5, ke.compile(&fusion, inputs)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto tref = atBiasEpilogue( atMatmul(aten_a.to(at::kFloat), aten_b.to(at::kFloat), layout), aten_bias); @@ -3257,19 +3242,19 @@ TEST_F(MatmulTest, ReproIssue1808) { auto inputs = matmulAtInput3DTuring(M, N, K, layout); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -3413,16 +3398,15 @@ TEST_P(MatmulTestWithLayout, MisalignedVectorization) { 
SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) ->schedule(fusion.get(), &mparams); - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( 8, 0, - fe.compileFusion( - fusion.get(), inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + ke.compile(fusion.get(), inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto outputs = ke.run(inputs); EXPECT_TRUE(outputs[0].allclose(tref, 0.001, 0.001)); } @@ -3473,13 +3457,13 @@ TEST_F(MatmulTest, MultipleConsecutiveDims) { at::Tensor B = at::randn({N1, N2, K}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto tref = at::reshape( at::linear( at::reshape(A.to(at::kFloat), {M1 * M2, K}), @@ -3539,13 +3523,13 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveMDims) { at::Tensor B = at::randn({N, K}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto Apermuted = A.permute({{1, 2}}).reshape({M1 * M2, K}); auto tref = at::linear(Apermuted.to(at::kFloat), B.to(at::kFloat)) .reshape({M1, M2, N}) @@ -3605,13 +3589,13 @@ TEST_F(MatmulTest, DISABLED_MultipleNonConsecutiveNDims) { at::Tensor B = at::randn({N1, K, N2}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto Bpermuted = B.permute({{1, 2}}).reshape({N1 * N2, K}); auto tref = at::linear(A.to(at::kFloat), Bpermuted.to(at::kFloat)) .reshape({M, N1, N2}); @@ -3663,13 +3647,13 @@ TEST_F(MatmulTest, MultipleMDimsBatch) { at::Tensor B = at::randn({Batch, N, K}, options); std::vector inputs{A, B}; - FusionExecutor fe; + KernelExecutor ke; NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( - 8, 0, fe.compileFusion(&fusion, inputs, 
LaunchParams(), matmul_cparams)); - ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty()); + 8, 0, ke.compile(&fusion, inputs, LaunchParams(), matmul_cparams)); + ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty()); ASSERT_FALSE( - PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(fe.kernel())); - auto cg_outputs = fe.runFusion(inputs); + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + auto cg_outputs = ke.run(inputs); auto tref = at::matmul(A.to(at::kFloat), at::permute(B.to(at::kFloat), {0, 2, 1})); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); @@ -3798,10 +3782,10 @@ TEST_F(HopperMatmulTest, HSH_NT_128BSwizzle) { auto inputs = matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); } diff --git a/tests/cpp/test_matmul_aten_evaluation.cpp b/tests/cpp/test_matmul_aten_evaluation.cpp index 9dd8a927009..1d078b51d96 100644 --- a/tests/cpp/test_matmul_aten_evaluation.cpp +++ b/tests/cpp/test_matmul_aten_evaluation.cpp @@ -164,8 +164,8 @@ TEST_P(MatmulNodeParametrizedTest, MatmulNodeConcrete) { at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda(); at::Tensor out_ref = at::matmul(t0, t1); - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs({t0, t1}); EXPECT_TRUE(at::allclose(out[0], out_ref)); } @@ -190,8 +190,8 @@ TEST_P(MatmulNodeParametrizedTest, MatmulNodeSymbolic) { at::Tensor t1 = at::randn(b_shape, at::kHalf).cuda(); at::Tensor out_ref = at::matmul(t0, t1); - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs({t0, t1}); EXPECT_TRUE(at::allclose(out[0], out_ref)); } @@ -227,17 +227,17 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeConcrete) { } at::Tensor out_ref = at::linear(t0, t1, bias_opt); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector out = {}; if (bias_shape.has_value()) { - out = fec.runFusionWithInputs({t0, t1, bias_opt}); + out = executor_cache.runFusionWithInputs({t0, t1, bias_opt}); } else { - out = fec.runFusionWithInputs({t0, t1}); + out = executor_cache.runFusionWithInputs({t0, t1}); } - const std::vector& executors = - fec.getMostRecentKernelRuntime()->executors(); + const std::vector& executors = + executor_cache.getMostRecentKernelRuntime()->executors(); EXPECT_EQ(executors.size(), 1); // Verify that fusion compilation was skipped. 
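These matmul/linear-node tests exercise the ATen-evaluation path rather than codegen, so after running through the cache they assert that no kernel was actually compiled. A condensed sketch of that pattern, using only calls that appear in this diff (template arguments were lost in rendering and are elided here as well):

```cpp
// Cache-driven run followed by a "no kernel was compiled" assertion; the
// matmul/linear node is expected to be evaluated through ATen instead.
FusionExecutorCache executor_cache(std::move(fusion));
auto out = executor_cache.runFusionWithInputs({t0, t1});
const auto& executors =
    executor_cache.getMostRecentKernelRuntime()->executors();
EXPECT_EQ(executors.size(), 1);
EXPECT_FALSE(executors.front().hasCompiledKernel());
EXPECT_TRUE(at::allclose(out[0], out_ref));
```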
EXPECT_FALSE(executors.front().hasCompiledKernel()); @@ -277,17 +277,17 @@ TEST_P(LinearNodeParametrizedTest, LinearNodeSymbolic) { } at::Tensor out_ref = at::linear(t0, t1, bias_opt); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector out = {}; if (bias_shape.has_value()) { - out = fec.runFusionWithInputs({t0, t1, bias_opt}); + out = executor_cache.runFusionWithInputs({t0, t1, bias_opt}); } else { - out = fec.runFusionWithInputs({t0, t1}); + out = executor_cache.runFusionWithInputs({t0, t1}); } - const std::vector& executors = - fec.getMostRecentKernelRuntime()->executors(); + const std::vector& executors = + executor_cache.getMostRecentKernelRuntime()->executors(); EXPECT_EQ(executors.size(), 1); // Verify that fusion compilation was skipped. EXPECT_FALSE(executors.front().hasCompiledKernel()); diff --git a/tests/cpp/test_matmul_sass.cpp b/tests/cpp/test_matmul_sass.cpp index 974300401d2..0d4385eb46b 100644 --- a/tests/cpp/test_matmul_sass.cpp +++ b/tests/cpp/test_matmul_sass.cpp @@ -98,16 +98,16 @@ sass::Container getSASSFor( SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) ->schedule(&fusion, &mparams); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); - return sass::parse(fe.disassembledKernelSASS()); + return sass::parse(ke.disassembledKernelSASS()); } // A fusion with epilogue made of binary op (scalar multiplication) @@ -161,13 +161,13 @@ sass::Container getBinaryOpMulEpilogueSASSFor( auto inputs = matmulAtInput3DTuring(M, N, K, layout); const double alpha = 2.5; - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second, alpha}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second, alpha}); + auto cg_outputs = ke.run({inputs.first, inputs.second, alpha}); auto tref = at::mul( atMatmul( inputs.first.to(at::kFloat), @@ -178,7 +178,7 @@ sass::Container getBinaryOpMulEpilogueSASSFor( NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); - return sass::parse(fe.disassembledKernelSASS()); + return sass::parse(ke.disassembledKernelSASS()); } } // namespace diff --git a/tests/cpp/test_matmul_scheduler.cpp b/tests/cpp/test_matmul_scheduler.cpp index e532f32ef57..f2109c7f1e1 100644 --- a/tests/cpp/test_matmul_scheduler.cpp +++ b/tests/cpp/test_matmul_scheduler.cpp @@ -2811,10 +2811,10 @@ TEST_P(AllocationDomainTest, BasicMatmul) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2844,10 +2844,10 @@ TEST_P(AllocationDomainTest, BasicMatmulNoTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, 
LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2880,10 +2880,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSet) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2918,10 +2918,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSin) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2955,10 +2955,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinNoTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -2992,10 +2992,10 @@ TEST_P(AllocationDomainTest, BasicMatmulWithPrologueSetCastSinSetNoTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -3029,10 +3029,10 @@ TEST_P(AllocationDomainTest, MatmulWithPrologueSetCastSinTranspose) { ->schedule(fusion.get(), &mparams); auto [t0, t1] = getInputTensors(M, N, K, a_m_inner, b_k_inner); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0, t1}); + auto cg_outputs = ke.run({t0, t1}); auto tref = t0.to(at::kFloat).matmul(t1.sin().to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } @@ -3069,10 +3069,310 @@ TEST_F(MatmulSchedulerTest, OperandOrderIssue2434) { auto y_ref = at::randn({N, K}, options); std::vector inputs{x_ref, y_ref}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache 
executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); auto tref = at::linear(x_ref.to(at::kFloat), y_ref.to(at::kFloat)); NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001)); } +TEST_F(MatmulSchedulerTest, HSH_TT) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::TT; + + auto tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype); // A [M, K, b] + auto tv1 = makeContigConcreteTensor({1, -1, -1}, dtype); // B [b, K, N] + fusion->addInput(tv0); + fusion->addInput(tv1); + + auto tv2 = fusedMultiplySum(tv0, tv1, {1}); + + // Reorder the accumulator as [M, N, K] + // [M, rK, N] -> [M, N, K] + tv2->reorder({{-2, -1}, {-1, -2}}); + tv2->commitLeafToLogical(); + + auto tv3 = castOp(DataType::Half, tv2); + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. + gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + //! TODO Disabled because hopper multiple matmul scheduler is currently a copy + //! of ampere scheduler. 
+ /* + KernelExecutor ke; + ke.compile( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + auto cg_outputs = ke.run({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); + */ +} + +TEST_F(MatmulSchedulerTest, HSH_TN) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::TN; + + auto tv0 = makeContigConcreteTensor({-1, 1, -1}, dtype); + auto tv1 = makeContigConcreteTensor({1, -1, -1}, dtype); + fusion->addInput(tv0); + fusion->addInput(tv1); + + // [M, b, K] x [b, N, K] -> [M, N, rK] + auto tv2 = fusedMultiplySum(tv0, tv1, {-1}); + + // [M, N] + auto tv3 = castOp(DataType::Half, tv2); + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. + gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + KernelExecutor ke; + ke.compile( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + + auto cg_outputs = ke.run({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); +} + +TEST_F(MatmulSchedulerTest, HSH_NT) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::NT; // [K, M] x [K, N] -> [M, N] + + auto tv0 = makeContigConcreteTensor({-1, -1, 1}, dtype); + auto tv1 = makeContigConcreteTensor({-1, 1, -1}, dtype); + fusion->addInput(tv0); + fusion->addInput(tv1); + + auto tv2 = fusedMultiplySum(tv0, tv1, {0}); + + // Reorder the accumulator as [M, N, K] + // [K, M, N] -> [M, N, K] + tv2->reorder({{-3, -1}}); + tv2->commitLeafToLogical(); + + auto tv3 = castOp(DataType::Half, tv2); + + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. 
+ gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. + gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + KernelExecutor ke; + ke.compile( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + + auto cg_outputs = ke.run({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); +} + +TEST_F(MatmulSchedulerTest, HSH_NN) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto dtype = DataType::Half; + constexpr auto layout = MmaLayout::NN; + + auto tv0 = makeContigConcreteTensor({1, -1, -1}, dtype); // A [b, K, M] + auto tv1 = makeContigConcreteTensor({-1, -1, 1}, dtype); // B [N, K, 1] + fusion->addInput(tv0); + fusion->addInput(tv1); + + auto tv2 = fusedMultiplySum(tv0, tv1, {1}); + + // Reorder the accumulator as [M, N, K] + // [N, rK, M] -> [M, N, K] + tv2->reorder({{-1, -3}}); + tv2->commitLeafToLogical(); + + auto tv3 = castOp(DataType::Half, tv2); + fusion->addOutput(tv3); + + NVF_CHECK( + 1 == ir_utils::getOpsOfType(fusion.get()).size(), + "matmul fusion must have at least one MmaOp"); + + // Create custom Matmul Params + MatMulTileOptions gemm_tile; + // TODO cta tile is a multiple of mma macro for hopper. + gemm_tile.cta_tile = GemmTile(128, 128, 32); + + // TODO warp tile is (macroM, macroN, macroK) for hopper. + gemm_tile.warp_tile = GemmTile(64, 64, 32); + + // TODO instruction tile is not used for hopper. 
+ gemm_tile.instruction_tile = GemmTile(16, 8, 16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 4}; + + // TODO use hopper macro + // mparams.mma_macro = MmaMacro::Hopper_64_256_16; + mparams.mma_macro = MmaMacro::Ampere_16_8_16; + + mparams.tile_sizes = gemm_tile; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 4; + + // TODO Create prefetch parameter + // mparams.circular_buffer_options.smem_circular_buffer_prefetch = 3; + + // Schedule matmul fusion using custom parameters + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(fusion.get(), &mparams); + + const int M = 32, N = 32, K = 256; + auto inputs = + matmulAtInput3DHopperSS(M, N, K, layout, data_type_to_aten(dtype)); + + // TODO Disabled because hopper multiple matmul scheduler is currently a copy + // of ampere scheduler. + /* + KernelExecutor ke; + ke.compile( + fusion.get(), + {inputs.first, inputs.second}, + LaunchParams(), + matmul_cparams); + auto cg_outputs = ke.run({inputs.first, inputs.second}); + auto tref = atMatmul(inputs.first.squeeze(), inputs.second.squeeze(), layout); + EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-5, 1e-5)); + */ +} + } // namespace nvfuser diff --git a/tests/cpp/test_mbarrier.cpp b/tests/cpp/test_mbarrier.cpp index 84c58192271..f7f9611d895 100644 --- a/tests/cpp/test_mbarrier.cpp +++ b/tests/cpp/test_mbarrier.cpp @@ -46,9 +46,9 @@ TEST_F(MBarrierTest, Simple) { tv2->axis(0)->parallelize(ParallelType::TIDy); tv2->axis(1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; + KernelExecutor ke; - fe.registerPostLoweringHook([](kir::Kernel* kernel) { + ke.registerPostLoweringHook([](kir::Kernel* kernel) { // Replace block sync with mbarrier FusionGuard fg(kernel); @@ -122,7 +122,7 @@ TEST_F(MBarrierTest, Simple) { top_level_exprs.push_back(invalidate); }); - fe.compileFusion(&fusion); + ke.compile(&fusion); // Make sure that the post-lowering hook successfully inserted all mbarrier // operations @@ -131,14 +131,14 @@ TEST_F(MBarrierTest, Simple) { &typeid(kir::MBarrierArrive), &typeid(kir::MBarrierWait), &typeid(kir::MBarrierInvalidate)}; - for (auto expr : fe.kernel()->topLevelExprs()) { + for (auto expr : ke.kernel()->topLevelExprs()) { remaining_mbarrier_exprs.erase(&typeid(*expr)); } EXPECT_TRUE(remaining_mbarrier_exprs.empty()); auto input = at::randn( {32, 32}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); - auto outputs = fe.runFusion({input}); + auto outputs = ke.run({input}); testValidate(&fusion, outputs, {input}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_memory.cpp b/tests/cpp/test_memory.cpp index 5af4ecc4867..085a95640bc 100644 --- a/tests/cpp/test_memory.cpp +++ b/tests/cpp/test_memory.cpp @@ -78,15 +78,15 @@ TEST_P(MemoryTest, LoadCache) { {1024}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); at::Tensor expected_output = input + 1.0f; - FusionExecutor fe; + KernelExecutor ke; { DebugDumpOptionsGuard debug_dump_options_guard; DebugDumpOptionsGuard::getCurOptions().set(DebugDumpOption::Ptx); - fe.compileFusion(&fusion, {input}); + ke.compile(&fusion, {input}); } // Verify PTX. 
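The "Verify PTX" step that follows compiles with the Ptx debug dump enabled and then greps the dumped PTX for the expected load instruction. A sketch of the check, condensed from the surrounding hunks (cache_op_str is the test parameter; the final search call is assumed from the regex and smatch set up in this diff):

```cpp
// Read back the dumped PTX and assert the requested cache operator
// (e.g. "ca", "cs", "cg") appears on the generated global load.
const executor_utils::CompiledKernel& compiled_kernel = ke.compiledKernel();
std::string ptx(compiled_kernel.ptx.begin(), compiled_kernel.ptx.end());
std::regex regex(R"(ld\.global\.)" + cache_op_str + R"(\.\S+)");
std::smatch match;
EXPECT_TRUE(std::regex_search(ptx, match, regex));
```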
- const executor_utils::CompiledKernel& compiled_kernel = fe.compiledKernel(); + const executor_utils::CompiledKernel& compiled_kernel = ke.compiledKernel(); std::string ptx(compiled_kernel.ptx.begin(), compiled_kernel.ptx.end()); std::regex regex(R"(ld\.global\.)" + cache_op_str + R"(\.\S+)"); std::smatch match; @@ -98,7 +98,7 @@ TEST_P(MemoryTest, LoadCache) { std::filesystem::remove(compiled_kernel.ptx_filename); // Verify output tensors. - std::vector actual_ts = fe.runFusion({input}); + std::vector actual_ts = ke.run({input}); testValidate( &fusion, actual_ts, {input}, {expected_output}, __LINE__, __FILE__); } @@ -153,15 +153,15 @@ TEST_F(MemoryTest, RefineCachePolicy) { {1024}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); at::Tensor c = a + b; - FusionExecutor fe; + KernelExecutor ke; { DebugDumpOptionsGuard debug_dump_options_guard; DebugDumpOptionsGuard::getCurOptions().set(DebugDumpOption::Ptx); - fe.compileFusion(&fusion, {a, b}); + ke.compile(&fusion, {a, b}); } // Verify PTX. - const executor_utils::CompiledKernel& compiled_kernel = fe.compiledKernel(); + const executor_utils::CompiledKernel& compiled_kernel = ke.compiledKernel(); std::string ptx(compiled_kernel.ptx.begin(), compiled_kernel.ptx.end()); expectMatchCount(ptx, R"(ld\.global\.ca\.v4\.\S+)", 1); expectMatchCount(ptx, R"(ld\.global\.cs\.v4\.\S+)", 1); @@ -170,7 +170,7 @@ TEST_F(MemoryTest, RefineCachePolicy) { debug() << "Removing " << compiled_kernel.ptx_filename << std::endl; std::filesystem::remove(compiled_kernel.ptx_filename); - std::vector actual_outputs = fe.runFusion({a, b}); + std::vector actual_outputs = ke.run({a, b}); testValidate(&fusion, actual_outputs, {a, b}, {c}, __LINE__, __FILE__); } @@ -457,16 +457,16 @@ TEST_P(TMASimpleLdstTest, Load) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), dim); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), dim); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); ASSERT_EQ( - XorFinder::findXor(fe.kernel()), (swizzle != MmaInputSmemSwizzle::None)); - TMADimChecker::getDim(fe.kernel()); + XorFinder::findXor(ke.kernel()), (swizzle != MmaInputSmemSwizzle::None)); + TMADimChecker::getDim(ke.kernel()); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -533,10 +533,10 @@ TEST_P(TMALoadTestWithABroadcastDim, LoadWithBroadcast) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -577,15 +577,15 @@ TEST_P(TMASimpleLdstTest, Store) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), dim); - 
TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), dim); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); ASSERT_EQ( - XorFinder::findXor(fe.kernel()), (swizzle != MmaInputSmemSwizzle::None)); + XorFinder::findXor(ke.kernel()), (swizzle != MmaInputSmemSwizzle::None)); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -639,13 +639,13 @@ TEST_F(TMAIndexingTest, Load2DTensorWith1DTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -676,13 +676,13 @@ TEST_F(TMAIndexingTest, Load1DTensorWith2DTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024 * 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -713,13 +713,13 @@ TEST_F(TMAIndexingTest, NonOneElementStride) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({1024, 1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -784,13 +784,13 @@ TEST_F(TMAIndexingTest, Advanced) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({4, 32, 2, 8, 8, 8, 32, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 4); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 4); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -833,13 +833,13 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({4, 32, 2, 8, 8, 8, 32, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + 
ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 4); - EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, fe.kernel())); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 4); + EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, ke.kernel())); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -886,13 +886,13 @@ TEST_F(TMAIndexingTest, DefineBoxByCompositing2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32, 4, 2, 8, 8, 8, 2, 8, 4}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 5); - EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, fe.kernel())); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 5); + EXPECT_FALSE(PredicatedChecker::isPredicated(tv1, ke.kernel())); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -947,13 +947,13 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation1) { int64_t multiple_of_16B_but_not_more = 4 * 67; auto t0 = at::randn( {prime_number, prime_number, multiple_of_16B_but_not_more}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -994,18 +994,18 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation2) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); int64_t multiple_of_8_but_not_more = 8 * 997; auto t0 = at::randn({multiple_of_8_but_not_more}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); // We will be using 2D TMA instead of 1D, because the strided box cannot be // merged with other bulk axes by rotation. So, this schedule will be // interpreted as viewing the tensor as 2D (M/8, 8) and then applying 2D TMA. // The outer dim of TMA is defined by boxing and striding splits, and the // inner dim is defined as implicit whole. - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); // The tensor shape is not a multiple of 8, so the view should fail.
@@ -1016,7 +1016,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation2) { .device(at::kCUDA, 0); int64_t prime_number = 997; auto t0 = at::randn({prime_number}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("must be divisible by 8"))); @@ -1056,8 +1056,8 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); int64_t multiple_of_23 = 23 * 997; auto t0 = at::randn({multiple_of_23, 8}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); // We will be using 3D TMA instead of 2D, because split(23, 8) is indivisible, // we cannot consider this schedule as a 2D TMA whose first dimension has box @@ -1065,10 +1065,10 @@ // TMA. The dim 0 of TMA is an implicit size-one, and the dim 1 is defined by // a boxing split whose box size is 8, and dim 2 is an implicit whole box with // size N. - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); // The tensor shape is not a multiple of 23, so the view should fail. @@ -1079,7 +1079,7 @@ TEST_F(TMAIndexingTest, DefineBoxByRotation3) { .device(at::kCUDA, 0); int64_t prime_number = 997; auto t0 = at::randn({prime_number, 8}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("must be divisible by 23"))); @@ -1118,14 +1118,14 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain1) { auto t0 = at::randn({128, 1024 * 128}, options) .transpose(0, 1) .view({128, 1024, 128}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); - ASSERT_TRUE(XorFinder::findXor(fe.kernel())); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); + ASSERT_TRUE(XorFinder::findXor(ke.kernel())); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1173,13 +1173,13 @@ TEST_F(TMAIndexingTest, NonTrivialGmemAllocationDomain2) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({2, 3, 5, 7, 11, 32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 3); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 3); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1221,13 +1221,13 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationLoad) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({100000}, options); -
FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 4); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1264,13 +1264,13 @@ TEST_F(TMAMiscTest, AdvancedThreadParallelizationStore) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({100000}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 4); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1300,13 +1300,13 @@ TEST_F(TMAMiscTest, DisableIndexHoisting) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1332,13 +1332,13 @@ TEST_F(TMAMiscTest, Repro1977) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({1024}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -1423,9 +1423,9 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { std::count_if(flattened_exprs.begin(), flattened_exprs.end(), is_wait), 1); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } @@ -1475,9 +1475,9 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { // RAW sync is inserted, the WAR pass has not run yet. We should be able to // remove the RAW sync by adding a cleanup pass. 
- FusionExecutor fe; - fe.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } @@ -1542,9 +1542,9 @@ TEST_F(TMAMiscTest, StoreSyncInsertion) { std::count_if(flattened_exprs.begin(), flattened_exprs.end(), is_wait), 2); - FusionExecutor fe; - fe.compileFusion(&fusion, {input}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({input}); + KernelExecutor ke; + ke.compile(&fusion, {input}, {}, matmul_cparams); + auto cg_outputs = ke.run({input}); testValidate(&fusion, cg_outputs, {input}, {input}, __LINE__, __FILE__); } } @@ -1586,12 +1586,12 @@ TEST_F(TMAMiscTest, LoadStrongCorrectness) { auto options = at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::arange(1, 33, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto expect = at::zeros({2, 1, 2, 16}, options); expect.flatten(0, 2).select(0, 0) = at::arange(1, 17, options); @@ -1632,8 +1632,8 @@ TEST_F(TMACompileTimeInvalidTest, BulkNotInTMA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "ParallelType::Bulk is only supported for cp.async.bulk."))); @@ -1661,8 +1661,8 @@ TEST_F(TMACompileTimeInvalidTest, BulkBroadcast) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "ParallelType::Bulk is only supported for IterType::Iteration."))); @@ -1689,8 +1689,8 @@ TEST_F(TMACompileTimeInvalidTest, InvalidParallelType) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Invalid parallel type for cp.async.bulk: V"))); @@ -1727,13 +1727,13 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalAddress) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0_aligned = at::randn({128 + items_of_16_bytes}, options) .narrow(0, items_of_16_bytes, 128); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0_aligned}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0_aligned}); + auto cg_outputs = ke.run({t0_aligned}); testValidate( &fusion, cg_outputs, {t0_aligned}, {t0_aligned}, __LINE__, __FILE__); @@ -1741,7 +1741,7 @@ 
TEST_F(TMARuntimeInvalidTest, MisalignedGlobalAddress) { [&]() { auto t0_misaligned = at::randn({128 + items_of_16_bytes / 2}, options) .narrow(0, items_of_16_bytes / 2, 128); - fe.runFusion({t0_misaligned}); + ke.run({t0_misaligned}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "globalAddress, which specifies the starting address of the memory region described, " @@ -1782,13 +1782,13 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalStride) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0_aligned = at::randn({128, 128 + items_of_16_bytes}, options).narrow(1, 0, 128); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_aligned}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0_aligned}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0_aligned}); + auto cg_outputs = ke.run({t0_aligned}); testValidate( &fusion, cg_outputs, {t0_aligned}, {t0_aligned}, __LINE__, __FILE__); @@ -1797,7 +1797,7 @@ TEST_F(TMARuntimeInvalidTest, MisalignedGlobalStride) { auto t0_misaligned = at::randn({128, 128 + items_of_16_bytes / 2}, options) .narrow(1, 0, 128); - fe.runFusion({t0_misaligned}); + ke.run({t0_misaligned}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "globalStrides array, which specifies tensor stride of each of the lower tensorRank - 1 dimensions in bytes, " @@ -1836,8 +1836,8 @@ TEST_F(TMACompileTimeInvalidTest, SizeOfTransfer) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The expected bytes must be a multiple of 16 bytes, but 8 is not."))); @@ -1876,18 +1876,18 @@ TEST_F(TMARuntimeInvalidTest, SizeOfTransfer) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({128}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, items_of_16_bytes}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0, items_of_16_bytes}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 1); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 1); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0, items_of_16_bytes}); + auto cg_outputs = ke.run({t0, items_of_16_bytes}); testValidate( &fusion, cg_outputs, {t0, items_of_16_bytes}, {t0}, __LINE__, __FILE__); EXPECT_THAT( - [&]() { fe.runFusion({t0, items_of_16_bytes / 2}); }, + [&]() { ke.run({t0, items_of_16_bytes / 2}); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The expected bytes must be a multiple of 16 bytes, but "))); } @@ -1929,19 +1929,19 @@ TEST_F(TMARuntimeInvalidTest, InvalidView) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); // (10240,) can be viewed as (10, 1024) auto t0_valid = at::randn({10240}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_valid}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0_valid}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); - auto cg_outputs = fe.runFusion({t0_valid}); + auto cg_outputs = ke.run({t0_valid}); testValidate(&fusion, cg_outputs, {t0_valid}, 
{t0_valid}, __LINE__, __FILE__); EXPECT_THAT( [&]() { // it is impossible to view (10249,) as (?, 1024) auto t0_inval = at::randn({10249}, options); - fe.runFusion({t0_inval}); + ke.run({t0_inval}); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Invalid view in TMA: the extent of"))); @@ -1975,8 +1975,8 @@ TEST_F(TMACompileTimeInvalidTest, InnermostDiscontiguous) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "The innermost dimension of the TMA domain must be contiguous"))); @@ -2016,8 +2016,8 @@ TEST_F(TMACompileTimeInvalidTest, MergeDiscontiguous) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Can not merge discontiguous dimensions, but"))); @@ -2052,8 +2052,8 @@ TEST_F(TMACompileTimeInvalidTest, InnermostElementStrideNotOne) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "When interleave is CU_TENSOR_MAP_INTERLEAVE_NONE " @@ -2091,8 +2091,8 @@ TEST_F(TMACompileTimeInvalidTest, SwizzleBulkWithNonBulk) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage(::testing::HasSubstr( "TMA domain must be a view of the allocation domain of the gmem tensor"))); @@ -2135,8 +2135,8 @@ TEST_F(TMADocTest, Figure13a) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2173,13 +2173,13 @@ TEST_F(TMADocTest, Figure14a) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2214,8 +2214,8 @@ TEST_F(TMADocTest, Figure13b) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2249,13 +2249,13 @@ TEST_F(TMADocTest, Figure14b) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = 
fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2291,8 +2291,8 @@ TEST_F(TMADocTest, Figure13c) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2327,13 +2327,13 @@ TEST_F(TMADocTest, Figure14c) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 200}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2366,8 +2366,8 @@ TEST_F(TMADocTest, Figure13d) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2398,13 +2398,13 @@ TEST_F(TMADocTest, Figure14d) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 12}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2441,8 +2441,8 @@ TEST_F(TMADocTest, Figure13e) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2478,13 +2478,13 @@ TEST_F(TMADocTest, Figure14e) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 1); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 1); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2523,13 +2523,13 @@ TEST_F(TMADocTest, Figure15a) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 10}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 0); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + 
TMAPredicateChecker::checkPredicate(ke.kernel(), 0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2565,13 +2565,13 @@ TEST_F(TMADocTest, Figure15b) { at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); auto t0 = at::randn({16, 12}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); - EXPECT_EQ(TMADimChecker::getDim(fe.kernel()), 2); - TMAPredicateChecker::checkPredicate(fe.kernel(), 4); + EXPECT_EQ(TMADimChecker::getDim(ke.kernel()), 2); + TMAPredicateChecker::checkPredicate(ke.kernel(), 4); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } @@ -2613,8 +2613,8 @@ TEST_F(TMADocTest, Figure15c) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2660,8 +2660,8 @@ TEST_F(TMADocTest, Figure15d) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2701,8 +2701,8 @@ TEST_F(TMADocTest, Figure15e) { EXPECT_THAT( [&]() { - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + KernelExecutor ke; + ke.compile(&fusion, {t0}, {}, matmul_cparams); }, ::testing::ThrowsMessage( ::testing::HasSubstr("Some error message"))); @@ -2755,9 +2755,9 @@ TEST_P(LdMatrixTest, Regular) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({size1, getK(macro)}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -2881,9 +2881,9 @@ TEST_P(StMatrixSingleTileTest, Regular) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({sizeM, sizeN}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -2942,9 +2942,9 @@ TEST_P(StMatrixTest, Regular) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({sizeM, sizeN}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -3017,9 +3017,9 @@ TEST_P(LdMatrixTest, Transpose) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto t0 = at::randn({getK(macro), size2}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + 
ke.compile(&fusion, {t0}, LaunchParams(), matmul_cparams); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_mma.cpp b/tests/cpp/test_mma.cpp index 470a1633a9e..5493b4a19d5 100644 --- a/tests/cpp/test_mma.cpp +++ b/tests/cpp/test_mma.cpp @@ -172,10 +172,10 @@ std::vector scheduleCompileAndRun( tv2->setLoopDomain(s.as()); } - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - return fe.runFusion({inputs.first, inputs.second}); + return ke.run({inputs.first, inputs.second}); } TEST_P(MmaTest, SingleTile) { @@ -388,11 +388,11 @@ TEST_P(HopperRS, SingleTile) { auto inputs = matmulAtInput3DHopperRS( getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -484,11 +484,11 @@ TEST_P(HopperRS, SingleTileWithTMALoadStore) { auto inputs = matmulAtInput3DHopperRS( getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -650,10 +650,10 @@ TEST_P(HopperSS, SingleTile) { auto inputs = matmulAtInput3DHopperSS( getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -779,10 +779,10 @@ TEST_P(HopperSS, SingleTileTransposed) { auto inputs = matmulAtInput3DHopperSS( getM(macro), getN(macro), getK(macro), layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), @@ -958,10 +958,10 @@ TEST_P(HopperSS, MultipleTile) { layout, data_type_to_aten(dtype)); - FusionExecutor fe; - fe.compileFusion( + KernelExecutor ke; + ke.compile( &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams); - auto cg_outputs = fe.runFusion({inputs.first, inputs.second}); + auto cg_outputs = ke.run({inputs.first, inputs.second}); auto tref = atMatmul( inputs.first.squeeze().to(at::kFloat), inputs.second.squeeze().to(at::kFloat), diff --git a/tests/cpp/test_move_pad.cpp b/tests/cpp/test_move_pad.cpp index 4499ecb5ec5..92ccaeae676 100644 --- a/tests/cpp/test_move_pad.cpp +++ b/tests/cpp/test_move_pad.cpp @@ -41,13 +41,14 @@ TEST_F(MovePadTest, 
UnaryCat) { at::Tensor t1 = at::randn({2, 10}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, BinaryCat) { @@ -71,13 +72,14 @@ TEST_F(MovePadTest, BinaryCat) { at::Tensor t2 = at::randn({2, 10}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, BinaryBroadcastOnNonCatDim) { @@ -105,19 +107,20 @@ TEST_F(MovePadTest, BinaryBroadcastOnNonCatDim) { at::Tensor t2 = at::randn({4, 5}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); // ensure that we propagate the pad across binary operation and the first // segment is no-op - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre( HeuristicIs(SchedulerType::NoOp), HeuristicIs(SchedulerType::PointWise))); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, BinaryBroadcastOnCatDim) { @@ -144,13 +147,14 @@ TEST_F(MovePadTest, BinaryBroadcastOnCatDim) { at::Tensor t2 = at::randn({2, 10}, options); std::vector aten_inputs = {t0, t1, t2}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, PadReplayOnMultipleUsesCase0) { @@ -179,13 +183,14 @@ TEST_F(MovePadTest, PadReplayOnMultipleUsesCase0) { at::Tensor t1 = at::randn({1, 10}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = 
fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, PadReplayOnMultipleUsesCase1) { @@ -215,10 +220,11 @@ TEST_F(MovePadTest, PadReplayOnMultipleUsesCase1) { at::Tensor t1 = at::randn({4, 10}, options); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, CascadePadCase0) { @@ -264,15 +270,16 @@ TEST_F(MovePadTest, CascadePadCase0) { at::Tensor t0 = at::randn({4, 10}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); Fusion* complete_fusion = runtime->fusionSegments()->completeFusion(); std::vector exprs = complete_fusion->exprs(); EXPECT_THAT(exprs, Contains(Property(&Expr::isA, IsTrue())).Times(1)); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, CascadePadCase1) { @@ -302,15 +309,16 @@ TEST_F(MovePadTest, CascadePadCase1) { at::Tensor t0 = at::randn({4, 10}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); Fusion* complete_fusion = runtime->fusionSegments()->completeFusion(); std::vector exprs = complete_fusion->exprs(); EXPECT_THAT(exprs, Contains(Property(&Expr::isA, IsTrue())).Times(2)); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, CascadePadCase2) { @@ -359,10 +367,11 @@ TEST_F(MovePadTest, CascadePadCase2) { at::Tensor t0 = at::randn({4, 10}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } 
TEST_F(MovePadTest, NotMergeNegativePad) { @@ -391,10 +400,11 @@ TEST_F(MovePadTest, NotMergeNegativePad) { at::Tensor t0 = at::randn({4, 10}, options); std::vector aten_inputs = {t0}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - testValidate(fec.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, __FILE__); } TEST_F(MovePadTest, BooleanCat) { @@ -418,18 +428,24 @@ TEST_F(MovePadTest, BooleanCat) { at::Tensor t2 = at::randn({2, 10}, options) > 0.5; std::vector aten_inputs = {t0, t1, t2}; - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); // ExpressionEvaluator is hitting an assert with dynamic value. - // https://github.com/NVIDIA/Fuser/issues/2697 testValidate(fec.fusion(), - // out_tensors, aten_inputs, __LINE__, __FILE__); + // https://github.com/NVIDIA/Fuser/issues/2697 + // testValidate(executor_cache.fusion(), out_tensors, aten_inputs, __LINE__, + // __FILE__); at::Tensor ref = at::cat({at::bitwise_and(t0, t1), t2}, 0); testValidate( - fec.fusion(), out_tensors, aten_inputs, {ref}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + aten_inputs, + {ref}, + __LINE__, + __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_move_split_cat.cpp b/tests/cpp/test_move_split_cat.cpp index 247aa96381e..beec3172e2d 100644 --- a/tests/cpp/test_move_split_cat.cpp +++ b/tests/cpp/test_move_split_cat.cpp @@ -39,9 +39,10 @@ TEST_F(MoveSplitCatTest, Cancellable_SplitImmediatelyFollowedByCat) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -60,9 +61,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_DifferentOrder) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 6}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -83,9 +85,10 @@ TEST_F(MoveSplitCatTest, Cancellable_SetWithoutPermute) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 5}, options); - FusionExecutorCache 
fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -108,9 +111,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_SliceAmountAndPaddingAmountMismatch) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -132,9 +136,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_CatOnlySubsetOfSplitOutputs) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -158,9 +163,10 @@ TEST_F(MoveSplitCatTest, Cancellable_PermuteInBetween) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -193,12 +199,13 @@ TEST_F(MoveSplitCatTest, Cancellable_IncompatibleAllocationOrder) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3, 5}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); // Check the two permutes are merged to one. 
- FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); Fusion* complete_fusion = runtime->fusionSegments()->completeFusion(); EXPECT_THAT(complete_fusion->exprs(), Contains(IsPermute()).Times(1)); @@ -232,9 +239,10 @@ TEST_F(MoveSplitCatTest, Cancellable_MultiplePermutesInBetween) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -258,9 +266,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_WrongAxis) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 2, 4}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -283,9 +292,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_SomeButNotAllArePermuted) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 2, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -311,9 +321,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_PermutedDifferently) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 2}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -338,9 +349,10 @@ TEST_F(MoveSplitCatTest, Noncancellable_UnsupportedOps) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 2, 4}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); 
EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -364,9 +376,10 @@ TEST_F(MoveSplitCatTest, Cancellable_ReshapeInBetween) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -393,9 +406,10 @@ TEST_F(MoveSplitCatTest, Cancellable_ReshapeAndPermuteInBetween) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({6, 10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[0].is_alias_of(in_tensor)); } @@ -445,9 +459,10 @@ TEST_F(MoveSplitCatTest, Cancellable_Issue1768) { at::randn({b * h * 3 * s * f}, options) .as_strided({b, h * 3, s, f}, {h * 3 * s * f, f, h * 3 * f, 1}); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); EXPECT_TRUE(out_tensors[2].is_alias_of(in_tensor)); @@ -471,9 +486,10 @@ TEST_F(MoveSplitCatTest, OuterSplit) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 6}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); } @@ -514,11 +530,12 @@ TEST_F(MoveSplitCatTest, MultiplePairs) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 6}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); Fusion* complete_fusion = runtime->fusionSegments()->completeFusion(); std::vector exprs = complete_fusion->exprs(); @@ 
-564,9 +581,10 @@ TEST_F(MoveSplitCatTest, MultipleCatsOnSameSplit) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 2}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_FALSE(out_tensors[0].is_alias_of(in_tensor)); EXPECT_TRUE(out_tensors[1].is_alias_of(in_tensor)); diff --git a/tests/cpp/test_multidevice_lower_communication.cpp b/tests/cpp/test_multidevice_lower_communication.cpp index 3c454777f0f..47f80f77d9c 100644 --- a/tests/cpp/test_multidevice_lower_communication.cpp +++ b/tests/cpp/test_multidevice_lower_communication.cpp @@ -17,9 +17,10 @@ namespace nvfuser { namespace { -void assertIsCompiledToHostIrContainer(const FusionExecutorCache& fec) { - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); - const std::vector& executors = runtime->executors(); +void assertIsCompiledToHostIrContainer( + const FusionExecutorCache& executor_cache) { + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); + const std::vector& executors = runtime->executors(); EXPECT_THAT(executors, testing::SizeIs(1)); for (const auto& executor : executors) { EXPECT_TRUE(executor.fusion()->isA()) @@ -71,9 +72,9 @@ TEST_P(LowerGatherTest, ) { at::randn({in_mesh.size(), kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); if (out_mesh.has(device_id)) { EXPECT_TRUE(at::equal(out_tensor, unsharded_tensor)); @@ -112,9 +113,10 @@ TEST_P(LowerScatterTest, ) { at::Tensor unsharded_tensor = at::randn({out_mesh.size(), kTensorSize}, tensor_options); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({unsharded_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = + executor_cache.runFusionWithInputs({unsharded_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); if (out_mesh.has(device_id)) { EXPECT_TRUE(at::equal(out_tensor, shardTensor(unsharded_tensor, out))); @@ -155,9 +157,9 @@ TEST_P(LowerSendRecvTest, ) { at::randn({in_mesh.size(), kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); if (out_mesh.has(device_id)) { EXPECT_TRUE(at::equal(out_tensor, shardTensor(unsharded_tensor, out))); @@ -194,9 +196,9 @@ TEST_F(LowerCollectiveTest, Allgather) { at::randn({num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_tensor, in); - 
FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); EXPECT_TRUE(at::equal(out_tensor, unsharded_tensor)); } @@ -221,10 +223,10 @@ TEST_F(LowerCollectiveTest, Broadcast) { const auto device_id = communicator_->deviceId(); at::Tensor in_tensor = unsharded_tensor.slice(0, device_id, device_id + 1); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; if (num_devices > 1) { - assertIsCompiledToHostIrContainer(fec); + assertIsCompiledToHostIrContainer(executor_cache); } EXPECT_TRUE( @@ -252,9 +254,9 @@ TEST_F(LowerCollectiveTest, Reduce) { const auto device_id = communicator_->deviceId(); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); if (device_id == kRoot) { // at::allclose instead of at::equal because addition is involved. @@ -281,9 +283,9 @@ TEST_F(LowerCollectiveTest, Allreduce) { at::randn({num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); EXPECT_TRUE(at::allclose(out_tensor, unsharded_in_tensor.sum(0))); } @@ -309,10 +311,10 @@ TEST_F(LowerCollectiveTest, Allreduce_Concrete) { at::randn({num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; if (num_devices > 1) { - assertIsCompiledToHostIrContainer(fec); + assertIsCompiledToHostIrContainer(executor_cache); } EXPECT_TRUE(at::allclose(out_tensor, unsharded_in_tensor.sum(0))); @@ -338,9 +340,9 @@ TEST_F(LowerCollectiveTest, ReduceScatter) { at::randn({num_devices, num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - assertIsCompiledToHostIrContainer(fec); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + assertIsCompiledToHostIrContainer(executor_cache); at::Tensor unsharded_out_tensor = unsharded_in_tensor.sum(0); EXPECT_TRUE(at::allclose(out_tensor, shardTensor(unsharded_out_tensor, out))); @@ -371,8 +373,8 @@ TEST_F(LowerCollectiveTest, ReduceScatter_Allgather) { 
at::randn({num_devices, num_devices, kTensorSize}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; EXPECT_TRUE(at::allclose(out_tensor, unsharded_in_tensor.sum(0))); } diff --git a/tests/cpp/test_multidevice_matmul.cpp b/tests/cpp/test_multidevice_matmul.cpp index 3032db30b94..24e84f56e5e 100644 --- a/tests/cpp/test_multidevice_matmul.cpp +++ b/tests/cpp/test_multidevice_matmul.cpp @@ -102,12 +102,18 @@ TEST_F(DistributedMatmulTest, MulSum_LayoutTN_NoComms) { std::vector inputs = {shardTensor(in0, a), in1}; auto expected_output = shardTensor(out, c); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + inputs, + {expected_output}, + __LINE__, + __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::Matmul)).Times(1)); @@ -156,13 +162,19 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutTN_NoComms) { std::vector inputs = {shardTensor(in0, a), in1}; auto expected_output = shardTensor(out, c); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + inputs, + {expected_output}, + __LINE__, + __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1)); @@ -208,13 +220,19 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutTN_Allgather) { std::vector inputs = {shardTensor(in0, a), in1}; auto expected_output = shardTensor(out, c); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + inputs, + {expected_output}, + __LINE__, + __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1)); @@ -258,12 +276,14 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutNT_AllReduce) { in1 = in1.view({Ko, Ki, N}); std::vector inputs = {shardTensor(in0, a), shardTensor(in1, b)}; - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache 
executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), outputs, inputs, {out}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), outputs, inputs, {out}, __LINE__, __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1)); @@ -315,12 +335,18 @@ TEST_F(DistributedMatmulTest, Matmul_LayoutNT_ReduceScatter) { std::vector inputs = {shardTensor(in0, a), shardTensor(in1, b)}; auto expected_output = shardTensor(out, c).view({1, Mi, N}); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), outputs, inputs, {expected_output}, __LINE__, __FILE__); + executor_cache.fusion(), + outputs, + inputs, + {expected_output}, + __LINE__, + __FILE__); - const FusionKernelRuntime* kernel_runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* kernel_runtime = + executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( kernel_runtime->fusionSegments()->groups(), Contains(HeuristicIs(SchedulerType::ExprEval)).Times(1)); @@ -354,16 +380,16 @@ TEST_F(DistributedMatmulTest, PresegPreservesSharding) { auto w_tensor = at::randn({mesh.size(), 36, 48}, tensor_options); auto sharded_w_tensor = shardTensor(w_tensor, w); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector inputs({x_tensor, sharded_w_tensor}); - std::vector outputs = fec.runFusionWithInputs(inputs); + std::vector outputs = executor_cache.runFusionWithInputs(inputs); at::Tensor expected_mm_t_tensor = atMatmul(x_tensor, w_tensor.view({mesh.size() * 36, 48}), MmaLayout::TN) .transpose(0, 1) .view({mesh.size(), 36, 12}); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, inputs, {shardTensor(expected_mm_t_tensor, mm_t)}, @@ -394,13 +420,13 @@ TEST_F(DistributedMatmulTest, AnnotateWeightOnly) { auto w_tensor = at::randn({mesh.size(), 3, 5}, tensor_options); auto sharded_w_tensor = shardTensor(w_tensor, w); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector inputs({x_tensor, sharded_w_tensor}); - std::vector outputs = fec.runFusionWithInputs(inputs); + std::vector outputs = executor_cache.runFusionWithInputs(inputs); at::Tensor expected_y_tensor = at::matmul(x_tensor, w_tensor); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, inputs, {shardTensor(expected_y_tensor, 0, mesh)}, diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index ece3a36b67a..1e1ff2eab9e 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -62,10 +62,16 @@ TEST_P(MultiDeviceReductionTest, UnshardedInput_ShardedOutput) { auto x1 = shardTensor(x0, tv1); auto x2 = x1 + x1; auto x3 = shardTensor(at::sum(x0 + x0, {sharded_input_dim}), tv3); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); - testValidate(fec.fusion(), outputs, inputs, {x1, x2, x3}, __LINE__, __FILE__); + 
testValidate( + executor_cache.fusion(), + outputs, + inputs, + {x1, x2, x3}, + __LINE__, + __FILE__); } // Test multidevice fusion with sharded input and replicated intermediates and @@ -98,9 +104,10 @@ TEST_P(MultiDeviceReductionTest, ShardedInput_ReplicatedOutput) { auto x1 = at::randn(unsharded_input_shape, tensor_options); std::vector inputs = {shardTensor(x1, tv0)}; auto x2 = x1 * 2; - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); - testValidate(fec.fusion(), outputs, inputs, {x1, x2}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); + testValidate( + executor_cache.fusion(), outputs, inputs, {x1, x2}, __LINE__, __FILE__); } INSTANTIATE_TEST_SUITE_P( @@ -137,10 +144,10 @@ TEST_F(MultiDeviceTest, Reduction) { auto unsharded_in_tensor = at::randn({mesh.size(), 4}, tensor_options); auto in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); testValidate( - fec.fusion(), + executor_cache.fusion(), out_tensors, {in_tensor}, {unsharded_in_tensor.sum(0)}, @@ -172,10 +179,10 @@ TEST_F(MultiDeviceTest, Slice) { auto expected_out = aten_x.split(4, 2); std::vector inputs = {{shardTensor(aten_x, x)}}; - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, inputs, {shardTensor(expected_out[0], x), shardTensor(expected_out[1], x)}, @@ -206,8 +213,8 @@ TEST_F(MultiDeviceTest, BackpropMeshes) { at::Tensor unsharded_x_tensor = at::randn({num_devices, 4}, tensor_options); at::Tensor x_tensor = shardTensor(unsharded_x_tensor, x); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor z_tensor = fec.runFusionWithInputs({x_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor z_tensor = executor_cache.runFusionWithInputs({x_tensor})[0]; EXPECT_THAT(z_tensor.sizes(), ElementsAre(1, 4)) << "Due to sharding propagation, z is supposed to " << "be sharded in the same way as x."; @@ -239,11 +246,11 @@ TEST_F(MultiDeviceTest, LayerNorm) { auto aten_outputs = at::native_layer_norm(aten_x, norm_shape, aten_weight, aten_bias, kEps); - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs({aten_x}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({aten_x}); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, {aten_x}, {std::get<0>(aten_outputs), @@ -278,14 +285,14 @@ TEST_F(MultiDeviceTest, Issue2758) { at::zeros({num_devices, num_devices, 4}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; at::Tensor expected_out_tensor = shardTensor(unsharded_in_tensor.sum(0), reduce_scattered) + in_tensor.size(1); testValidate( - fec.fusion(), + executor_cache.fusion(), {out_tensor}, {in_tensor}, {expected_out_tensor}, @@ -314,20 +321,20 @@ 
TEST_F(MultiDeviceTest, Transpose) { at::randn({num_devices, 1024, 1024}, tensor_options); at::Tensor in_tensor = shardTensor(unsharded_in_tensor, in); - FusionExecutorCache fec(std::move(fusion)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; at::Tensor expected_out_tensor = shardTensor(unsharded_in_tensor.transpose(1, 2), out); testValidate( - fec.fusion(), + executor_cache.fusion(), {out_tensor}, {in_tensor}, {expected_out_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre(HeuristicIs(SchedulerType::Transpose))); @@ -365,11 +372,12 @@ TEST_P(MultiDeviceBroadcastTest, NotExpanded) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({1, 8}, options); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_P(MultiDeviceBroadcastTest, Expanded) { @@ -395,11 +403,12 @@ TEST_P(MultiDeviceBroadcastTest, Expanded) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({8}, options).as_strided({3, 8}, {0, 1}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } INSTANTIATE_TEST_SUITE_P(, MultiDeviceBroadcastTest, testing::Bool()); diff --git a/tests/cpp/test_multidevice_transformer.cpp b/tests/cpp/test_multidevice_transformer.cpp index f9d5d96c5da..f4b999e7aec 100644 --- a/tests/cpp/test_multidevice_transformer.cpp +++ b/tests/cpp/test_multidevice_transformer.cpp @@ -698,9 +698,9 @@ TEST_P(DistributedTransformerTest, MLP_Layer) { reference_outs[2], reference_outs[3]}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {0.01, 0.01, 0.02, 0.02}); } @@ -785,9 +785,9 @@ TEST_P(DistributedTransformerTest, Sequence_Parallel_MLP_Layer) { shardTensor(reference_outs[2], 0, mesh), shardTensor(reference_outs[3], 0, mesh)}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {0.01, 0.01, 0.02, 0.02}); } @@ -846,9 +846,9 @@ TEST_P(DistributedTransformerTest, MultiheadAttention) { 
reference_outs[2], reference_outs[3]}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {0.02, 0.02, 0.02, 0.02}); } @@ -920,8 +920,8 @@ TEST_P(DistributedTransformerTest, MLP_Backward) { shardTensor(outs[5], 0, mesh), // linear0 bias grad outs[6]}; // linear0 grad x - FusionExecutorCache fec(std::move(fusion)); - auto outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {1e-5, 0.2, 1e-5, 0.01, 0.2, 0.01, 0.02}); } @@ -1021,9 +1021,9 @@ TEST_P(DistributedTransformerTest, MHA_Backward) { .view({1, 3 * E / D}), // linear0 bias grad reference_outs[12]}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto out = fec.runFusionWithInputs(inputs); + auto out = executor_cache.runFusionWithInputs(inputs); validate( expected_outputs, out, {1e-5, 0.02, 1e-5, .01, .02, 0.2, 0.2, 0.2, 0.02}); } @@ -1146,9 +1146,9 @@ TEST_P(DistributedTransformerTest, Forward) { std::vector expected_outputs = { ln0_out_, mha_out_, ln1_out_, mlp_out_, at_out}; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate(expected_outputs, outputs, {1e-4, 0.02, 0.04, 0.04, 0.04}); } @@ -1430,9 +1430,9 @@ TEST_P(DistributedTransformerTest, Backward) { shardTensor(mlp_out_[0], 1, mesh) // mlp linear1 }; - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::manual_seed(getATenRandomSeed()); - auto outputs = fec.runFusionWithInputs(inputs); + auto outputs = executor_cache.runFusionWithInputs(inputs); validate( expected_outputs, outputs, diff --git a/tests/cpp/test_no_op.cpp b/tests/cpp/test_no_op.cpp index a6e35e9b9ac..0b0e093767a 100644 --- a/tests/cpp/test_no_op.cpp +++ b/tests/cpp/test_no_op.cpp @@ -186,10 +186,11 @@ TEST_F(NoOpTest, View) { TensorView* out = reshape(in, in_shape, out_shape); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 4}, at::dtype(at::kFloat).device(at::kCUDA, 0)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); at::Tensor out_tensor = out_tensors[0]; @@ -198,7 +199,7 @@ TEST_F(NoOpTest, View) { // Verify the NoOp scheduler was kicked in. 
const std::vector<SegmentedGroup*>& groups = - fec.getMostRecentKernelRuntime()->fusionSegments()->groups(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(); ASSERT_EQ(groups.size(), 1); SegmentedGroup* group = groups[0]; EXPECT_EQ(group->schedulerType(), SchedulerType::NoOp); @@ -220,12 +221,13 @@ TEST_F(NoOpTest, ExpandedReduction) { out = segment_set(out); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::ones({}).cuda().as_strided({2, 3}, {0, 0}); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT( runtime->fusionSegments()->groups(), UnorderedElementsAre(HeuristicIs(SchedulerType::NoOp))); diff --git a/tests/cpp/test_persistent_buffer.cpp b/tests/cpp/test_persistent_buffer.cpp index 529423145dc..3463395b11b 100644 --- a/tests/cpp/test_persistent_buffer.cpp +++ b/tests/cpp/test_persistent_buffer.cpp @@ -343,8 +343,8 @@ TEST_F(PersistentBufferTest, FusionPersistentBufferProjection_CUDA) { auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor aten_t0 = at::randn({99, 101}, options); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({aten_t0}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({aten_t0}); testValidate(&fusion, cg_outputs, {aten_t0}, __LINE__, __FILE__); } @@ -611,8 +611,8 @@ TEST_F(PersistentBufferTest, FusionLayerNormFusedOpsRedundantCast_CUDA) { hidden_size * dataTypeSize(dtype), "Persistent buffer size is not correct!"); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -679,8 +679,8 @@ TEST_F(PersistentBufferTest, FusionRecomputePersistentBuffer_CUDA) { persistent_buffer_info2.persistent_buffers.size() == 1, "After project to other buffers, should have one persistent buffer!"); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__); } @@ -1172,10 +1172,10 @@ TEST_F(PersistentBufferTest, PostReductionBroadcastCheck) { auto t1 = at::randn({dim0, dim1}, options); auto t2 = at::sum(t0, {1}).unsqueeze(1) + t0; auto t4 = t2 + t1; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); NVF_CHECK( - !fec.getMostRecentKernelRuntime()->isSegmented(), + !executor_cache.getMostRecentKernelRuntime()->isSegmented(), "unexpected segmentation!"); testValidate(fusion, cg_outputs, {t0, t1}, {t4}, __LINE__, __FILE__); @@ -1211,10 +1211,10 @@ 
TEST_F(PersistentBufferTest, PostReductionBroadcastCheckMultiBcastDims) { auto t1 = at::randn({dim0, dim1, dim2}, options); auto t2 = at::sum(t0, {1, 2}).unsqueeze(-1).unsqueeze(-1) + t0; auto t4 = t2 + t1; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); NVF_CHECK( - !fec.getMostRecentKernelRuntime()->isSegmented(), + !executor_cache.getMostRecentKernelRuntime()->isSegmented(), "unexpected segmentation!"); testValidate(fusion, cg_outputs, {t0, t1}, {t4}, __LINE__, __FILE__); @@ -1243,15 +1243,16 @@ TEST_F(PersistentBufferTest, SmemPersistentNotSupportedIn3DReduction) { .device(at::kCUDA, 0); auto t0 = at::randn(input_shape, options); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector aten_inputs = {t0}; - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); // should be segmented since buffer size is larger than 32K and smem // persistent is not supported yet for 3D reduction. - EXPECT_TRUE(fec.getMostRecentKernelRuntime()->isSegmented()); + EXPECT_TRUE(executor_cache.getMostRecentKernelRuntime()->isSegmented()); - testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); } TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { @@ -1297,10 +1298,10 @@ TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { scheduler->schedule(fusion.get(), heuristic_params.get()); // Run the fusion and validate the results - FusionExecutor fe; - fe.compileFusion(fusion.get(), aten_inputs); + KernelExecutor ke; + ke.compile(fusion.get(), aten_inputs); // Shared memory access should be vectorized. 
- // getBankConflictInfo(fe.kernel()) triggers error "std::get: wrong index for + // getBankConflictInfo(ke.kernel()) triggers error "std::get: wrong index for // variant" when trying to evaluate index with: // `expr_eval.evaluate(ti->index()).as();` for (auto tv : fusion->allTvs()) { @@ -1313,8 +1314,8 @@ TEST_F(PersistentBufferTest, SmemPersistent2DReduction) { } } } - auto cg_outputs = fe.runFusion( - aten_inputs, heuristic_params->as()->lparams); + auto cg_outputs = + ke.run(aten_inputs, heuristic_params->as()->lparams); auto t1 = t0 / t0.sum({1, 2, 3}, true); testValidate(fusion.get(), cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_pointwise.cpp b/tests/cpp/test_pointwise.cpp index 552cb18f3a8..c4684adbb6e 100644 --- a/tests/cpp/test_pointwise.cpp +++ b/tests/cpp/test_pointwise.cpp @@ -23,8 +23,8 @@ using PointwiseTest = NVFuserTest; namespace { -int64_t getVecSizeForPointwise(const FusionExecutorCache& fec) { - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); +int64_t getVecSizeForPointwise(const FusionExecutorCache& executor_cache) { + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); NVF_CHECK(!runtime->isSegmented()); const PointwiseParams* params = runtime->schedulerHeuristics() ->heuristicsList() @@ -62,7 +62,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity2D) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector> size_and_vec{{17, 1}, {18, 2}, {32, 4}}; @@ -71,9 +71,9 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity2D) { auto vec = pair.second; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1000000, size}, options).narrow(1, 0, 16); - auto cg_outputs = fec.runFusionWithInputs({input0}); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -90,7 +90,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity3D) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::vector> size_and_vec{{17, 1}, {10, 2}, {16, 4}}; @@ -99,9 +99,9 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity3D) { auto vec = pair.second; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1000000, size, 3}, options).narrow(1, 0, 8); - auto cg_outputs = fec.runFusionWithInputs({input0}); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -120,7 +120,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity5D) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -134,9 +134,9 @@ TEST_F(PointwiseTest, VectorizeStrideContiguity5D) { at::Tensor input0 = at::randn({4, size1, 12345, size2, 3}, options) .narrow(1, 0, 8) .narrow(3, 0, 4); - auto cg_outputs = fec.runFusionWithInputs({input0}); + auto cg_outputs = 
executor_cache.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -158,7 +158,7 @@ TEST_F(PointwiseTest, VectorizeStrideMisalignedBase) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -195,8 +195,8 @@ TEST_F(PointwiseTest, VectorizeStrideMisalignedBase) { at::Tensor flat = at::randn({alloc_size}, options); at::Tensor input0 = flat.as_strided(shape, stride, /*storage_offset=*/align); - auto cg_outputs = fec.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } } @@ -214,7 +214,7 @@ TEST_F(PointwiseTest, VectorizeStrideContiguitySelfOverlapping) { auto tv1 = add(tv0, tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -241,8 +241,8 @@ TEST_F(PointwiseTest, VectorizeStrideContiguitySelfOverlapping) { stride1, (int64_t)stride2 * 12345, (int64_t)stride2, 3, 1}; at::Tensor input0 = at::empty_strided(shape, stride, options); input0.random_(); - auto cg_outputs = fec.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), vec); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), vec); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } } @@ -262,13 +262,13 @@ TEST_F(PointwiseTest, VectorizeAllocationDomain) { tv1->setAllocationDomain({tv1->axis(0), tv1->axis(2), tv1->axis(1)}, true); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::empty_strided({1024, 128, 25}, {128 * 25, 1, 128}, options); - auto cg_outputs = fec.runFusionWithInputs({input0}); - EXPECT_EQ(getVecSizeForPointwise(fec), 4); + auto cg_outputs = executor_cache.runFusionWithInputs({input0}); + EXPECT_EQ(getVecSizeForPointwise(executor_cache), 4); testValidate(fusion, cg_outputs, {input0}, __LINE__, __FILE__); } @@ -407,7 +407,7 @@ TEST_F(PointwiseTest, Issue1567VectorizationFactorAnalysisCase2) { auto tv3 = transpose(tv2, 0, 1); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1024, 1, 2}, options); @@ -444,7 +444,7 @@ TEST_F(PointwiseTest, VIssue1567ectorizationFactorAnalysisCase3) { auto tv3 = transpose(tv2, 0, 1); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input0 = at::randn({1, 1024, 2}, options); @@ -549,9 +549,9 @@ TEST_F(PointwiseTest, ShardedPointwise) { unsharded_pparams->flip_grid_binding); pwise_scheduler->schedule(&sharded_fusion, sharded_params.get()); - FusionExecutor 
fe; - fe.compileFusion(&sharded_fusion, sharded_inputs, sharded_params->lparams); - auto cg_outputs = fe.runFusion(sharded_inputs, sharded_params->lparams); + KernelExecutor ke; + ke.compile(&sharded_fusion, sharded_inputs, sharded_params->lparams); + auto cg_outputs = ke.run(sharded_inputs, sharded_params->lparams); testValidate( &sharded_fusion, cg_outputs, sharded_inputs, __LINE__, __FILE__); } @@ -659,11 +659,12 @@ TEST_F(PointwiseTest, VectorizeWithExpandedBroadcast) { auto in_tensor = at::randn({kTensorSize}, options).as_strided({2, kTensorSize}, {0, 1}); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - EXPECT_GT(getVecSizeForPointwise(fec), 1); + EXPECT_GT(getVecSizeForPointwise(executor_cache), 1); } using VectUnrollFactors = std::tuple; @@ -705,10 +706,10 @@ TEST_P(PointwiseParamsTest, UnrollOnTopOfVectorize) { // Schedule, compile, run, validate scheduler_instance->schedule(fusion.get(), pparams); - FusionExecutor fe; - fe.compileFusion(fusion.get(), runtime_inputs, pparams->lparams); - auto cg_outputs = fe.runFusion(runtime_inputs, pparams->lparams); - const auto& lparams = fe.lastLaunchParams(); + KernelExecutor ke; + ke.compile(fusion.get(), runtime_inputs, pparams->lparams); + auto cg_outputs = ke.run(runtime_inputs, pparams->lparams); + const auto& lparams = ke.lastLaunchParams(); ASSERT_EQ(lparams.gdimy(), dim0 / unroll_outer); ASSERT_EQ( lparams.gdimx(), dim1 / vect_factor / lparams.bdimx() / unroll_inner); diff --git a/tests/cpp/test_predicate_elimination.cpp b/tests/cpp/test_predicate_elimination.cpp index 8b941f7e0e6..bfb12a12b8a 100644 --- a/tests/cpp/test_predicate_elimination.cpp +++ b/tests/cpp/test_predicate_elimination.cpp @@ -77,9 +77,9 @@ TEST_F(PredicateEliminationTest, 2) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = (t0 + 1).sum({1}) + 1; @@ -127,9 +127,9 @@ TEST_F(PredicateEliminationTest, 3) { for (auto size : {1, 2, 999, 1001, 1234, 10000}) { auto t0 = at::randn({size}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = sum(t0) + 1; testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); @@ -180,9 +180,9 @@ TEST_F(PredicateEliminationTest, 4) { for (auto s1 : sizes) { auto t0 = at::randn({s0, s1}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto t1 = t0.sum({1}); auto t3 = t1.sum({0}) + 1; @@ -228,9 +228,9 @@ TEST_F(PredicateEliminationTest, 5) { for (auto s0 : sizes) { auto t0 = at::randn({s0}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); auto ref = t0.mean({0}); @@ -277,9 +277,9 @@ 
TEST_F(PredicateEliminationTest, 6) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 3}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -313,9 +313,9 @@ TEST_F(PredicateEliminationTest, 7) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({123}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -382,12 +382,12 @@ TEST_F(PredicateEliminationTest, 8) { at::Tensor aten_t3 = at::randn(full_size, options); // tv0 - 3 at::Tensor aten_t4 = at::randn({channel_size}, options); // tv4 - 4 - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = - fec.runFusionWithInputs({aten_t0, aten_t1, aten_t2, aten_t3, aten_t4}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs( + {aten_t0, aten_t1, aten_t2, aten_t3, aten_t4}); const auto& compiled_executors = - fec.getMostRecentKernelRuntime()->executors(); + executor_cache.getMostRecentKernelRuntime()->executors(); NVF_CHECK(compiled_executors.size() == 1, "Unexpected scheduling"); NVF_CHECK( !PredicatedChecker::isPredicated(tv6, compiled_executors.at(0).kernel()), @@ -431,9 +431,9 @@ TEST_F(PredicateEliminationTest, 9) { // with TIDx in this tensor EXPECT_TRUE(PredicatedChecker::isPredicated(tv1, gpulw)); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -470,16 +470,16 @@ TEST_F(PredicateEliminationTest, ExtentEqualToMaxParallelTypeExtent) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({10 * 32}, options); - FusionExecutor fe; - fe.registerLoweringHook([&](GpuLower* lower) { + KernelExecutor ke; + ke.registerLoweringHook([&](GpuLower* lower) { lower->passes().insert( lower->passes().begin(), {"validate_smem_predicate_elimination", validate_smem_predicate_elimination}); }); - fe.compileFusion(&fusion, {t0}, {}, matmul_cparams); + ke.compile(&fusion, {t0}, {}, matmul_cparams); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_preseg_passes.cpp b/tests/cpp/test_preseg_passes.cpp index ca554ca0532..1a34d693c53 100644 --- a/tests/cpp/test_preseg_passes.cpp +++ b/tests/cpp/test_preseg_passes.cpp @@ -635,11 +635,12 @@ TEST_F(PresegTest, ReplaceOutput) { TensorView* y = add(x, x); fusion->replaceOutput(x, y); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({10}, at::device(at::kCUDA)); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); } TEST_F(PresegTest, 
ExtentSubstitution) { diff --git a/tests/cpp/test_replay.cpp b/tests/cpp/test_replay.cpp index 1bb5c460bef..76f1271907a 100644 --- a/tests/cpp/test_replay.cpp +++ b/tests/cpp/test_replay.cpp @@ -46,8 +46,9 @@ TEST_F(ReplayTest, HorizontallyMergeReshapeAndPermute) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 5}, options); - FusionExecutorCache fec(std::move(fusion)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); auto out_tensor = out_tensors[0]; @@ -85,8 +86,9 @@ TEST_F(ReplayTest, HorizontallyMergeReshapeAndNeg) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({4, 5}, options); - FusionExecutorCache fec(std::move(fusion)); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + FusionExecutorCache executor_cache(std::move(fusion)); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); ASSERT_EQ(out_tensors.size(), 1); auto out_tensor = out_tensors[0]; diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index cc7fc96d8cd..ed02e67ee5f 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -63,9 +63,9 @@ TEST_P(ResizeTest, Pad1) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -99,9 +99,9 @@ TEST_P(ResizeTest, Pad2) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -152,9 +152,9 @@ TEST_P(ResizeTest, Pad3) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -186,9 +186,9 @@ TEST_P(ResizeTest, Pad4) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -241,9 +241,9 @@ TEST_P(ResizeTest, Pad5) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}); @@ -292,9 +292,9 @@ TEST_P(ResizeTest, Pad6) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, 
cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -343,9 +343,9 @@ TEST_P(ResizeTest, Pad7) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -388,9 +388,9 @@ TEST_F(ResizeTest, Pad8) { auto t0 = at::randn(999, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {0, 1}) + at::pad(t0, {1, 0}); @@ -613,9 +613,9 @@ TEST_F(ResizeTest, Cat1) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -645,9 +645,9 @@ TEST_F(ResizeTest, Cat2) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -686,9 +686,9 @@ TEST_F(ResizeTest, Cat3) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 1); @@ -730,9 +730,9 @@ TEST_F(ResizeTest, Cat4) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 1); @@ -779,9 +779,9 @@ TEST_F(ResizeTest, Cat5) { auto t2 = at::randn(shape2, options); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -823,9 +823,9 @@ TEST_F(ResizeTest, Cat6) { auto t2 = at::randn(shape2, options); std::vector aten_inputs({t0, t1, t2}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1, t2}, 0); @@ -879,9 +879,9 @@ TEST_F(ResizeTest, Cat7) { std::vector aten_inputs_ivalue( {aten_inputs.begin(), aten_inputs.end()}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs_ivalue); - auto cg_outputs = fe.runFusion(aten_inputs_ivalue); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs_ivalue); + auto cg_outputs = ke.run(aten_inputs_ivalue); auto ref = at::cat(aten_inputs, concat_dim); @@ -1013,9 +1013,9 @@ TEST_F(ResizeTest, Slice1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - 
FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -1044,9 +1044,9 @@ TEST_F(ResizeTest, Slice2) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1142,9 +1142,9 @@ TEST_F(ResizeTest, Slice4) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = (t0 + 1).to(at::kDouble).sum({1}); @@ -1197,9 +1197,9 @@ TEST_F(ResizeTest, Slice5) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto t1 = t0.index( {at::indexing::Slice(0, at::indexing::None), @@ -1249,9 +1249,9 @@ TEST_F(ResizeTest, SliceConstantShmoo) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1294,13 +1294,13 @@ TEST_F(ResizeTest, SliceInputShmoo) { !fusion.hasDynamicTransform(), "Expected to have no dynamic transform"); } - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto t0 = at::randn(shape, options); for (auto [start, stop] : slice_cases) { std::vector aten_inputs({t0, start, stop}); - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -1328,14 +1328,15 @@ TEST_F(ResizeTest, SliceInputShmooFusionExecutorCache) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto t0 = at::randn(shape, options); for (auto [start, stop] : slice_cases) { std::vector aten_inputs({t0, start, stop}); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); } } @@ -1755,9 +1756,9 @@ TEST_P(ResizeTest, PadWithValue) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}, "constant", 2); @@ -1830,9 +1831,9 @@ TEST_P(ResizeTest, PadHalfWithDoubleValue) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - 
FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0, {1, 1}, "constant", 2.5); @@ -2186,7 +2187,7 @@ TEST_F(ResizeTest, FusionSizeZeroSliceSplitSchedule) { FusionExecutorCache executor_cache(std::move(fusion)); auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionExecutor fe; + KernelExecutor ke; auto ref0 = t0.index({at::indexing::Slice(0, 2)}); auto ref1 = t0.index({at::indexing::Slice(2, 4)}); @@ -2228,15 +2229,15 @@ TEST_F(ResizeTest, FusionSizeZeroSliceSplit) { tv1->merge(0, 1); // size 0*5 = 0 tv1->split(0, 4); // sizes (0, 4) - FusionExecutor fe; - fe.compileFusion(fusion.get()); + KernelExecutor ke; + ke.compile(fusion.get()); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - auto cg_outputs = fe.runFusion(aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref0 = t0.index({at::indexing::Slice(2, 2), at::indexing::Slice(0, 5)}); @@ -2267,7 +2268,7 @@ TEST_F(ResizeTest, FusionSqueezeSymbolic) { // tv1 is of shape {0, 5} fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); @@ -2275,14 +2276,14 @@ TEST_F(ResizeTest, FusionSqueezeSymbolic) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0, 20}); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto ref0 = t0.flatten(); NVF_CHECK(ref0.equal(cg_outputs[0])); EXPECT_THAT( - [&]() { fec.runFusionWithInputs({t0, 10}); }, + [&]() { executor_cache.runFusionWithInputs({t0, 10}); }, ThrowsMessage( HasSubstr("must concretize to IterType::Broadcast but found"))); } @@ -2680,9 +2681,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2733,9 +2734,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual2) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref_t1 = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2784,9 +2785,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual3) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(slice_offset, shape[0] - slice_offset)}); @@ -2823,9 +2824,9 @@ TEST_F(ResizeTest, Slice1DVectorizeManual4) { auto t0_unaligned = at::randn(shape, options); auto t0_aligned = t0_unaligned.index({at::indexing::Slice(3, -1)}); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0_aligned}); - auto cg_outputs = 
fe.runFusion({t0_aligned}); + KernelExecutor ke; + ke.compile(&fusion, {t0_aligned}); + auto cg_outputs = ke.run({t0_aligned}); auto ref_aligned = t0_aligned.index({at::indexing::Slice(1, -3)}); @@ -2867,9 +2868,9 @@ TEST_F(ResizeTest, Slice2DVectorizeManual1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index( {at::indexing::Slice(slice_offset, shape[0] - slice_offset), @@ -2917,11 +2918,11 @@ TEST_F(ResizeTest, Slice3DVectorizeManual1) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); EXPECT_THAT( - [&]() { fe.runFusion(aten_inputs); }, + [&]() { ke.run(aten_inputs); }, ThrowsMessage( HasSubstr("with word size 2 not possible due to invalid stride"))); } @@ -2960,11 +2961,11 @@ TEST_F(ResizeTest, Slice3DVectorizeManual2) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); EXPECT_THAT( - [&]() { fe.runFusion(aten_inputs); }, + [&]() { ke.run(aten_inputs); }, ThrowsMessage( HasSubstr("with word size 4 not possible due to invalid stride"))); } @@ -3041,9 +3042,9 @@ TEST_F(ResizeTest, SliceAndReshapeRepro540Manual) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); for (const auto i : c10::irange(3)) { auto slice_out_ref = t0.index( @@ -3086,23 +3087,23 @@ TEST_P(ResizeTest, ReshapeToPad) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({4, 3}, options); std::vector aten_inputs = {at_x, 1, 1, 3, 4}; auto at_y = at::pad(at_x.reshape({3, 4}), {0, 1, 0, 1}); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // Assert that we segmented into two segments auto seg_fusion = - fusion_executor_cache.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); EXPECT_TRUE(seg_fusion->isSegmented()); EXPECT_EQ(seg_fusion->groups().size(), 2); testValidate( - fusion_executor_cache.fusion(), + executor_cache.fusion(), outputs, aten_inputs, {at_y}, @@ -3131,23 +3132,23 @@ TEST_F(ResizeTest, ReshapeToSlice) { auto tv2 = slice(tv1, {{fusion.zeroVal(), s0}, {fusion.zeroVal(), s1}}); fusion.addOutput(tv2); - FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_x = at::randn({4, 3}, options); std::vector aten_inputs = {at_x, 3, 2, 3, 4}; auto at_y = at::slice(at::slice(at_x.reshape({3, 4}), 0, 0, 3), 1, 0, 2); - auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto outputs = 
executor_cache.runFusionWithInputs(aten_inputs); // Assert that we segmented into two segments auto seg_fusion = - fusion_executor_cache.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); EXPECT_TRUE(seg_fusion->isSegmented()); EXPECT_EQ(seg_fusion->groups().size(), 2); testValidate( - fusion_executor_cache.fusion(), + executor_cache.fusion(), outputs, aten_inputs, {at_y}, @@ -3179,9 +3180,9 @@ TEST_F(ResizeTest, CatOfBroadcast) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({t0, t1}, 0); @@ -3216,9 +3217,9 @@ TEST_F(ResizeTest, CatOfExpandedBroadcast) { auto t1 = at::randn(shape1, options); std::vector aten_inputs({t0, t1}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::cat({at::expand_copy(t0, shape0e), t1}, 0); @@ -3302,9 +3303,9 @@ TEST_P(ResizeTest, PadOfBroadcast) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3338,9 +3339,9 @@ TEST_P(ResizeTest, PadOfExpandedBroadcast) { EnableOptionsGuard::getCurOptions().unset(EnableOption::IdModel); } - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); } @@ -3374,7 +3375,7 @@ TEST_F(ResizeTest, DynamicReshapeIssue1393) { auto tv4 = expand(tv3, {s0, s1, s3}); fusion->addOutput(tv4); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({3}, options).as_strided({3, 4}, {1, 0}); @@ -3382,7 +3383,7 @@ TEST_F(ResizeTest, DynamicReshapeIssue1393) { auto ref = t0.add(t1).as_strided({3, 4, 5}, {4, 1, 0}); std::vector aten_inputs({t0, t1}); - auto outputs = fec.runFusionWithInputs(aten_inputs); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); testValidate(fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__); } @@ -3424,13 +3425,18 @@ TEST_F(ResizeTest, SqueezeSlicedExpand) { auto t0 = at::randn(shape0, options); std::vector aten_inputs({t0}); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); auto ref = at::squeeze(at::slice(t0, 1, 2, 3), 1); testValidate( - fec.fusion(), cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__); + executor_cache.fusion(), + cg_outputs, + aten_inputs, + {ref}, + __LINE__, + __FILE__); } // Vectorization through resize is not supported yet. 
Make sure @@ -3602,14 +3608,18 @@ TEST_F(ResizeTest, Issue2552) { TensorView* z = add(x, y); fusion->addOutput(z); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::dtype(at::kFloat).device(at::kCUDA); at::Tensor x_tensor = at::randn({1, 3}, options); at::Tensor y_tensor = at::randn({1, 3}, options); std::vector out_tensors = - fec.runFusionWithInputs({x_tensor, y_tensor}); + executor_cache.runFusionWithInputs({x_tensor, y_tensor}); testValidate( - fec.fusion(), out_tensors, {x_tensor, y_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {x_tensor, y_tensor}, + __LINE__, + __FILE__); } TEST_F(ResizeTest, Chunk_NegativeSize) { @@ -3623,11 +3633,11 @@ TEST_F(ResizeTest, Chunk_NegativeSize) { fusion->addOutput(out); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); EXPECT_THAT( [&]() { auto in_tensor = at::randn({13}).cuda(); - fec.runFusionWithInputs({in_tensor}); + executor_cache.runFusionWithInputs({in_tensor}); }, ThrowsMessage(HasSubstr("Invalid resized domain extent"))); } @@ -3643,10 +3653,11 @@ TEST_F(ResizeTest, Chunk_SizeZero) { fusion->addOutput(out); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto in_tensor = at::randn({15}).cuda(); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors.back().numel(), 0); } @@ -3662,10 +3673,11 @@ TEST_F(ResizeTest, Chunk_Uneven) { fusion->addOutput(out); } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto in_tensor = at::randn({16}).cuda(); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); EXPECT_EQ(out_tensors.back().numel(), 1); } @@ -3715,9 +3727,9 @@ TEST_F(ResizeTest, SliceScheduledLikeProducer) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = t0.index({at::indexing::Slice(1, shape[0] - 1)}); @@ -3763,9 +3775,9 @@ TEST_F(ResizeTest, PadScheduledLikeConsumer) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad(t0 + 1, {1, 1}) + 1; @@ -3815,9 +3827,9 @@ TEST_F(ResizeTest, SliceThenPadLeftHalf) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad( t0.index({at::indexing::Slice(0, shape[0] / 2)}), {0, shape[0] / 2}); @@ -3870,9 +3882,9 @@ TEST_F(ResizeTest, 
SliceThenPadRightHalf) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::pad( t0.index({at::indexing::Slice(shape[0] / 2, shape[0])}), @@ -3934,9 +3946,9 @@ TEST_F(ResizeTest, SliceThenConcat) { auto t0 = at::randn(shape, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); EXPECT_TRUE(t0.equal(cg_outputs[0])); } @@ -4028,9 +4040,9 @@ TEST_F(ResizeTest, SliceSliceConcatConcat) { auto t0 = at::randn({i0}, options); std::vector aten_inputs({t0}); - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); - auto cg_outputs = fe.runFusion(aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); auto ref = at::concat( {at::slice(t0, 0, 0, rope_size / 2) + 1, @@ -4041,4 +4053,74 @@ TEST_F(ResizeTest, SliceSliceConcatConcat) { NVF_CHECK(ref.equal(cg_outputs[0])); } +// manual scheduling that should have vectorized load on padded inputs. +TEST_F(ResizeTest, VectorizePadLowering) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + const std::vector shape({1024L * 1024L}); + + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + auto tv1 = pad(tv0, {IrBuilder::create(4L), IrBuilder::create(4L)}); + fusion.addOutput(tv1); + + tv1->split(0, 4); + tv1->split(0, 128); + + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::TIDx); + tv1->axis(2)->parallelize(ParallelType::Vectorize); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); + + auto ref = at::pad(t0, {4, 4}); + ASSERT_TRUE(ref.equal(cg_outputs[0])); +} + +// manual scheduling that should have vectorized load. +TEST_F(ResizeTest, VectorizeWhereLowering) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + const std::vector shape({1024L * 1024L}); + + // Note: nvfuser currently only supports vectorization with a single + // TensorView input. + auto s0 = IrBuilder::create(DataType::Bool); + fusion.addInput(s0); + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + auto tv1 = where(s0, IrBuilder::create(2.0), tv0); + fusion.addOutput(tv1); + + tv1->split(0, 4); + tv1->split(0, 128); + + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::TIDx); + tv1->axis(2)->parallelize(ParallelType::Vectorize); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector aten_inputs({at::Scalar(false), t0}); + + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); + auto cg_outputs = ke.run(aten_inputs); + + // Note: we cannot use at::where, because aten only supports tensors as + // predicates.
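[Editor's note: illustrative sketch, not part of this patch.] The note above explains that the ATen reference cannot be built with at::where because ATen requires a tensor predicate. A hedged workaround, if an at::where-based reference were wanted, is to materialize the scalar predicate as a 0-dim boolean tensor, which at::where then broadcasts; the names below mirror the test and are assumptions for illustration only:

    // Wrap the scalar predicate (false in this test) in a 0-dim bool tensor
    // so at::where accepts it and broadcasts it against t0.
    auto pred = at::full({}, false, t0.options().dtype(at::kBool));
    auto where_ref = at::where(pred, at::full_like(t0, 2.0), t0);
    // With pred == false, at::where selects t0 everywhere, which is why the
    // test below can simply assert t0.equal(cg_outputs[0]).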
+ ASSERT_TRUE(t0.equal(cg_outputs[0])); +} + } // namespace nvfuser diff --git a/tests/cpp/test_rng.cpp b/tests/cpp/test_rng.cpp index fb05848a86a..c8f7c545ae6 100644 --- a/tests/cpp/test_rng.cpp +++ b/tests/cpp/test_rng.cpp @@ -80,18 +80,23 @@ TEST_F(RNGTest, ValidateWithCURand) { fusion->addOutput(tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({size}); + auto cg_outputs = executor_cache.runFusionWithInputs({size}); at::manual_seed(0); auto ref0 = generate_uniform(size, at::kFloat); auto ref1 = generate_uniform(size, at::kDouble); testValidate( - fec.fusion(), cg_outputs, {size}, {ref0, ref1}, __LINE__, __FILE__); + executor_cache.fusion(), + cg_outputs, + {size}, + {ref0, ref1}, + __LINE__, + __FILE__); } } @@ -116,11 +121,11 @@ TEST_F(RNGTest, ManualScheduleValidateWithCURand) { auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({size}, options); - FusionExecutor fe; - fe.compileFusion(fusion, {t0}); + KernelExecutor ke; + ke.compile(fusion, {t0}); at::manual_seed(0); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -154,11 +159,11 @@ TEST_F(RNGTest, ManualScheduleValidateWithCURand2) { /*maybe_symbolic=*/false); fusion->addOutput(tv0); - FusionExecutor fe; - fe.compileFusion(fusion, {10, 10, 10, 10}); + KernelExecutor ke; + ke.compile(fusion, {10, 10, 10, 10}); at::manual_seed(0); - auto cg_outputs = fe.runFusion({10, 10, 10, 10}); + auto cg_outputs = ke.run({10, 10, 10, 10}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -182,13 +187,13 @@ TEST_F(RNGTest, BroadcastingRNG) { auto tv4 = add(tv0, tv3); fusion->addOutput(tv4); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({5, 1}, options); at::Tensor t1 = at::zeros({5, 5}, options); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); auto out = cg_outputs[0]; NVF_CHECK((out.select(1, 0) == out.select(1, 1)).all().item()) NVF_CHECK((out.select(1, 0) == out.select(1, 2)).all().item()) @@ -212,20 +217,21 @@ TEST_F(RNGTest, BroadcastingRNG2) { auto tv3 = add(tv1, tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({1}, options); at::Tensor t1 = at::zeros({size}, options); at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({t0, t1}); + auto cg_outputs = executor_cache.runFusionWithInputs({t0, t1}); auto out = cg_outputs[0]; at::manual_seed(0); auto ref = generate_uniform(1, dtype).expand_as(t1); - testValidate(fec.fusion(), {out}, {t0, t1}, {ref}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out}, {t0, t1}, {ref}, __LINE__, __FILE__); } } } @@ -287,9 +293,9 @@ TEST_F(RNGTest, BroadcastingRNGSmemNonSquareTile) { SchedulerEntry::makeSchedulerInstance(SchedulerType::Transpose) ->schedule(fusion, &tparams); - FusionExecutor fe; - fe.compileFusion(fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(fusion, {t0, t1}); + 
auto cg_outputs = ke.run({t0, t1}); auto out = cg_outputs[0]; NVF_CHECK((out.select(1, 0) == out.select(1, 1)).all().item()); @@ -314,18 +320,18 @@ TEST_F(RNGTest, Uniform) { fusion->addOutput(tv0); fusion->addOutput(tv1); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({size, -1.0, 1.0}); + auto cg_outputs = executor_cache.runFusionWithInputs({size, -1.0, 1.0}); at::manual_seed(0); auto ref0 = generate_uniform(size, at::kFloat) * 2 - 1; auto ref1 = generate_uniform(size, at::kDouble) * 2 - 1; testValidate( - fec.fusion(), + executor_cache.fusion(), cg_outputs, {size, -1.0, 1.0}, {ref0, ref1}, @@ -354,11 +360,11 @@ TEST_F(RNGTest, Normal) { fusion->addOutput(tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({size, 1.0, 0.5}); + auto cg_outputs = executor_cache.runFusionWithInputs({size, 1.0, 0.5}); at::manual_seed(0); auto ref0 = generate_normal(size, at::kFloat) * 0.5f + 1.0f; @@ -367,7 +373,7 @@ TEST_F(RNGTest, Normal) { auto ref3 = generate_normal(size, at::kDouble); testValidate( - fec.fusion(), + executor_cache.fusion(), cg_outputs, {size, 1.0, 0.5}, {ref0, ref1, ref2, ref3}, @@ -389,13 +395,13 @@ TEST_F(RNGTest, RandLikeReduction) { auto tv3 = add(tv1, tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0); at::Tensor t0 = at::zeros({2, 3}, options); at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs({t0}); + auto cg_outputs = executor_cache.runFusionWithInputs({t0}); auto out = cg_outputs[0]; at::manual_seed(0); @@ -403,7 +409,7 @@ TEST_F(RNGTest, RandLikeReduction) { auto t2 = generate_uniform(3, dtype).expand_as(t1); auto t3 = t1.add(t2); - testValidate(fec.fusion(), {out}, {t0}, {t3}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), {out}, {t0}, {t3}, __LINE__, __FILE__); } //! 
This is the same as the Uniform test, but we compare against @@ -447,7 +453,7 @@ TEST_F(RNGTest, FunctionalUniform) { fusion->addOutput(tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) { at::manual_seed(0); @@ -465,7 +471,7 @@ TEST_F(RNGTest, FunctionalUniform) { std::vector aten_inputs({size, -1.0, 1.0, 0, 0}); at::manual_seed(0); - auto cg_outputs = fec.runFusionWithInputs(aten_inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); std::vector aten_outputs; if (do_stochastic) { @@ -475,7 +481,7 @@ TEST_F(RNGTest, FunctionalUniform) { } testValidate( - fec.fusion(), + executor_cache.fusion(), cg_outputs, aten_inputs, aten_outputs, @@ -514,7 +520,7 @@ TEST_F(RNGTest, DifferentOffsets) { fusion->addOutput(tv0); } - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); std::unique_ptr fusion_ptr2 = std::make_unique(); { @@ -533,7 +539,7 @@ TEST_F(RNGTest, DifferentOffsets) { for (int64_t size : {1, 4}) { at::manual_seed(0); EXPECT_TRUE(get_current_offset() == 0); - auto r1 = fec.runFusionWithInputs({size}).at(0); + auto r1 = executor_cache.runFusionWithInputs({size}).at(0); EXPECT_TRUE(get_current_offset() == 4); auto r23 = fec2.runFusionWithInputs({size}); auto r2 = r23.at(0); diff --git a/tests/cpp/test_scalar_hoisting.cpp b/tests/cpp/test_scalar_hoisting.cpp index 6aa08c52b53..d0295aa20f3 100644 --- a/tests/cpp/test_scalar_hoisting.cpp +++ b/tests/cpp/test_scalar_hoisting.cpp @@ -213,9 +213,9 @@ TEST_F(ScalarHoistTest, IndexHoist1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({15, 17}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(&fusion, {t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -257,9 +257,9 @@ TEST_F(ScalarHoistTest, IndexHoist2) { auto t0 = at::randn({16}, options); auto t1 = at::randn({16}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - auto cg_outputs = fe.runFusion({t0, t1}); + KernelExecutor ke; + ke.compile(&fusion, {t0, t1}); + auto cg_outputs = ke.run({t0, t1}); testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); } @@ -290,9 +290,9 @@ TEST_F(ScalarHoistTest, IndexHoist3) { at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::arange(10000, options).view({100, 100}); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {t0}); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get(), {t0}); + auto cg_outputs = ke.run({t0}); const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T2) { @@ -369,9 +369,9 @@ TEST_F(ScalarHoistTest, ARange) { int64_t start = 0, end = 100, step = 1; - FusionExecutor fe; - fe.compileFusion(fusion.get(), {start, end, step}); - auto cg_outputs = fe.runFusion({start, end, step}); + KernelExecutor ke; + ke.compile(fusion.get(), {start, end, step}); + auto cg_outputs = ke.run({start, end, step}); const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(int64_t i0, int64_t i1, int64_t i2, Tensor T0, Tensor T1) { diff --git a/tests/cpp/test_scatter_gather.cpp b/tests/cpp/test_scatter_gather.cpp index 67237e0b5e4..5e1bfdd2eb1 100644 --- 
a/tests/cpp/test_scatter_gather.cpp +++ b/tests/cpp/test_scatter_gather.cpp @@ -132,7 +132,7 @@ TEST_F(ScatterGatherTest, TorchGatherAllRankAllSelectedDim) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto options_i = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); for (const auto is_take_along : {false, true}) { - for (int rank = 1; rank <= 5; ++rank) { + for (int rank = 1; rank <= 3; ++rank) { for (int dim = 0; dim < rank; ++dim) { // this test uses a random input shape, clear the allocator to avoid // OOM. @@ -586,10 +586,10 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorPointwise1) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutor fe; - fe.compileFusion(&fusion, aten_inputs); + KernelExecutor ke; + ke.compile(&fusion, aten_inputs); - auto outputs = fe.runFusion(aten_inputs); + auto outputs = ke.run(aten_inputs); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -621,11 +621,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorPointwise2) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -655,11 +655,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorReduction1) { auto t1 = at::randint(0, shape[0], {2}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Reduction, SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); @@ -695,11 +695,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorReduction2) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise, SchedulerType::Reduction}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); @@ -734,11 +734,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorReduction3) { at::randint(0, shape_before_gather[1], shape_after_gather, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); 
testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -776,11 +776,11 @@ TEST_F(ScatterGatherTest, DISABLED_TakeAlongAxisIntermediateTensorReduction4) { at::randint(0, shape_before_gather[1], shape_after_gather, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Reduction}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -814,11 +814,12 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorNormalization1) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent}); + executor_cache.getMostRecentKernelRuntime(), + {SchedulerType::InnerPersistent}); auto t0_d = t0.to(at::kDouble); auto ref = at::take_along_dim( @@ -857,11 +858,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorNormalization2) { auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise, SchedulerType::InnerPersistent}); auto t5 = at::take_along_dim(t0.to(at::kDouble) + 1, t1.unsqueeze(-1), 1) @@ -902,11 +903,12 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorNormalization3) { at::randint(0, shape_before_gather[1], shape_after_gather, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent}); + executor_cache.getMostRecentKernelRuntime(), + {SchedulerType::InnerPersistent}); auto t3 = at::take_along_dim(t0.to(at::kDouble) + 1, t1, 1); auto ref = t3 / t3.sum({1}).unsqueeze(-1); @@ -943,13 +945,13 @@ TEST_F( auto t1 = at::randint(0, shape[1], {shape[0], 1}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // The reduction patterns of the normalization and the final // reduction are different, so they are segmented out validateSegmentation( - fec.getMostRecentKernelRuntime(), + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent, SchedulerType::Reduction}); auto t0_d = t0.to(at::kDouble); @@ -995,11 +997,12 @@ TEST_F( auto t1 = at::randint(0, shape[1], {shape[0]}, options_i); 
std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::InnerPersistent}); + executor_cache.getMostRecentKernelRuntime(), + {SchedulerType::InnerPersistent}); auto t0_d = t0.to(at::kDouble); auto t6 = at::take_along_dim( @@ -1045,11 +1048,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorTranspose1) { auto t1 = at::randint(0, shape[0], {shape[1], shape[2]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::Transpose}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::Transpose}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1088,11 +1091,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorTranspose2) { auto t1 = at::randint(0, shape[0], {10, shape[2], shape[1]}, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1133,13 +1136,13 @@ TEST_F(ScatterGatherTest, TakeAlongAxisIntermediateTensorTranspose3) { auto t1 = at::randint(0, shape_before[2], shape_after, options_i); std::vector aten_inputs = {t0, t1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); // Transpose scheduler should work for this case but not currently // supported validateSegmentation( - fec.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); + executor_cache.getMostRecentKernelRuntime(), {SchedulerType::PointWise}); testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__); } @@ -1188,11 +1191,11 @@ TEST_F(ScatterGatherTest, TakeAlongAxisCrossEntropyLoss) { auto t1 = at::randint(371, {128}, options).to(at::ScalarType::Long); std::vector inputs({t0, t1}); - FusionExecutorCache fec(std::move(fusion_ptr)); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); - auto cg_outputs = fec.runFusionWithInputs(inputs); + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); - auto kernel_runtime = fec.getMostRecentKernelRuntime(); + auto kernel_runtime = executor_cache.getMostRecentKernelRuntime(); validateSegmentation( kernel_runtime, @@ -1290,10 +1293,10 @@ TEST_F(ScatterGatherTest, GatherIterGoupedReduction) { " grouped iterations, found ", gpulw.kernel()->summary().num_grouped_iterations); - FusionExecutor fe; + KernelExecutor ke; auto lparams = rparams->lparams; - fe.compileFusion(&fusion, aten_inputs, lparams); - auto cg_outputs = fe.runFusion(aten_inputs, lparams); + ke.compile(&fusion, aten_inputs, lparams); + auto 
cg_outputs = ke.run(aten_inputs, lparams); auto t_gather = at::gather(input, dim, input_idx); testValidate( diff --git a/tests/cpp/test_sdpa_node.cpp b/tests/cpp/test_sdpa_node.cpp index 772945f0909..b63986d5a0b 100644 --- a/tests/cpp/test_sdpa_node.cpp +++ b/tests/cpp/test_sdpa_node.cpp @@ -252,8 +252,8 @@ TEST_F(SDPATest, NonCausalAttnConcrete) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -299,8 +299,8 @@ TEST_F(SDPATest, NonCausalAttnSymbolic) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -345,8 +345,8 @@ TEST_F(SDPATest, CausalAttn) { /*return_debug_mask=*/false, /*scale=*/1e-3); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs({q, k, v}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -496,8 +496,8 @@ TEST_F(SDPATest, NonCausalAttnConcreteBwd) { std::vector sdpa_bwd_inputs = { grad_out, q, k, v, output, log_sumexp, philox_seed, philox_offset}; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs(sdpa_bwd_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs(sdpa_bwd_inputs); auto [ref_grad_query, ref_grad_key, ref_grad_value] = at::_scaled_dot_product_flash_attention_backward( @@ -518,7 +518,7 @@ TEST_F(SDPATest, NonCausalAttnConcreteBwd) { /*scale=*/scale); testValidate( - fec.fusion(), + executor_cache.fusion(), out, sdpa_bwd_inputs, {ref_grad_query, ref_grad_key, ref_grad_value}, @@ -605,8 +605,8 @@ TEST_F(SDPATest, NonCausalAttnSymbolicBwd) { std::vector sdpa_bwd_inputs = { grad_out, q, k, v, output, log_sumexp, philox_seed, philox_offset}; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs(sdpa_bwd_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs(sdpa_bwd_inputs); auto [ref_grad_query, ref_grad_key, ref_grad_value] = at::_scaled_dot_product_flash_attention_backward( @@ -627,7 +627,7 @@ TEST_F(SDPATest, NonCausalAttnSymbolicBwd) { /*scale=*/scale); testValidate( - fec.fusion(), + executor_cache.fusion(), out, sdpa_bwd_inputs, {ref_grad_query, ref_grad_key, ref_grad_value}, @@ -683,8 +683,8 @@ TEST_F(SDPATest, AttnProgram) { scale); auto expected_out = (std::get<0>(aten_outputs).to(at::kFloat)) * 2.0; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs({q, k, v}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs({q, k, v}); EXPECT_TRUE(at::allclose(out[0], expected_out)); } @@ -744,8 +744,8 @@ TEST_F(SDPATest, AttnFwdBwd) { at::Tensor v = at::randn(v_shape, options).set_requires_grad(true); at::Tensor grad_out = at::randn(attn_shape, options); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = fec.runFusionWithInputs({q, k, v, grad_out}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = 
executor_cache.runFusionWithInputs({q, k, v, grad_out}); auto attn = at::scaled_dot_product_attention( q, @@ -761,7 +761,7 @@ TEST_F(SDPATest, AttnFwdBwd) { attn.backward(grad_out); testValidate( - fec.fusion(), + executor_cache.fusion(), nvf_out, {q, k, v, grad_out}, {attn, q.grad(), k.grad(), v.grad()}, @@ -824,9 +824,9 @@ TEST_F(SDPATest, Sharded_SdpaFwd) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = - fec.runFusionWithInputs({q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs( + {q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); validateSdpaFwdOutputs(nvf_out, aten_out); } @@ -928,8 +928,8 @@ TEST_F(SDPATest, Sharded_SdpaBwd) { philox_seed, philox_offset}; - FusionExecutorCache fec(std::move(fusion)); - auto out = fec.runFusionWithInputs(sdpa_bwd_inputs); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out = executor_cache.runFusionWithInputs(sdpa_bwd_inputs); auto [ref_grad_query, ref_grad_key, ref_grad_value] = at::_scaled_dot_product_flash_attention_backward( @@ -950,7 +950,7 @@ TEST_F(SDPATest, Sharded_SdpaBwd) { /*scale=*/scale); testValidate( - fec.fusion(), + executor_cache.fusion(), out, sdpa_bwd_inputs, {ref_grad_query.unsqueeze(0), @@ -1016,9 +1016,9 @@ TEST_F(SDPATest, ComputeAt) { /*return_debug_mask=*/false, scale); - FusionExecutorCache fec(std::move(fusion)); - auto nvf_out = - fec.runFusionWithInputs({q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); + FusionExecutorCache executor_cache(std::move(fusion)); + auto nvf_out = executor_cache.runFusionWithInputs( + {q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)}); validateSdpaFwdOutputs(nvf_out, aten_out); } diff --git a/tests/cpp/test_segmentation.cpp b/tests/cpp/test_segmentation.cpp index c3a69b09cbb..b893c5c29ad 100644 --- a/tests/cpp/test_segmentation.cpp +++ b/tests/cpp/test_segmentation.cpp @@ -45,10 +45,10 @@ TEST_F(SegmentationTest, Issue1284_Repro1) { at::Tensor at_in_1 = at::randn(input_shape_1, options); std::vector aten_inputs = {at_in_0, at_in_1}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); testValidate(&fusion, outputs, {at_in_0, at_in_1}, __LINE__, __FILE__); @@ -84,10 +84,10 @@ TEST_F(SegmentationTest, Issue1284_Repro2) { std::vector aten_inputs = {at_in_0, at_in_1, at_in_2}; - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs(aten_inputs); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(aten_inputs); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); testValidate( @@ -147,12 +147,14 @@ TEST_F(SegmentationTest, SegmentHintOnNonTerminatingOutput) { fusion->addOutput(add_out); fusion->addOutput(mul_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = 
fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); // Segment 1: in -> add_out (defined by segment_set) // Segment 2: add_out -> mul_out EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); @@ -195,18 +197,19 @@ TEST_F(SegmentationTest, EnforceSegmentationByCachingBeforeAndAfter) { } } - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); testValidate( - fec.fusion(), + executor_cache.fusion(), out_tensors, {in_tensor}, {in_tensor / in_tensor.sum({0})}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 2); } @@ -225,10 +228,12 @@ TEST_F(SegmentationTest, SetAllocationDomainOnSegmentBoundary) { add_out->setAllocationDomain( {add_out->axis(0), add_out->axis(1), add_out->axis(2)}, false); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3, 5}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(SegmentationTest, InputForwardingUntilBinary) { @@ -254,16 +259,20 @@ TEST_F(SegmentationTest, InputForwardingUntilBinary) { z = segment_set(z); fusion->addOutput(z); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3}, options); std::vector out_tensors = - fec.runFusionWithInputs({in_tensor, in_tensor}); + executor_cache.runFusionWithInputs({in_tensor, in_tensor}); testValidate( - fec.fusion(), out_tensors, {in_tensor, in_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {in_tensor, in_tensor}, + __LINE__, + __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_EQ(runtime->fusionSegments()->groups().size(), 1); } @@ -285,14 +294,18 @@ TEST_F(SegmentationTest, InputForwardingUntilOutput) { fusion->addOutput(out0); fusion->addOutput(out1); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({2, 3}, options); std::vector out_tensors = - fec.runFusionWithInputs({in_tensor, in_tensor}); + executor_cache.runFusionWithInputs({in_tensor, in_tensor}); testValidate( - fec.fusion(), out_tensors, {in_tensor, in_tensor}, __LINE__, __FILE__); + executor_cache.fusion(), + out_tensors, + {in_tensor, in_tensor}, + __LINE__, + 
__FILE__); } TEST_F(SegmentationTest, ForwardedExprsAreNotMergeable) { @@ -308,9 +321,10 @@ TEST_F(SegmentationTest, ForwardedExprsAreNotMergeable) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in_tensor = at::randn({10}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(SegmentationTest, ForwardedExprsAreReplicated) { @@ -328,9 +342,10 @@ TEST_F(SegmentationTest, ForwardedExprsAreReplicated) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in_tensor = at::randn({10, 20}, options); - FusionExecutorCache fec(std::move(fusion)); - auto out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + FusionExecutorCache executor_cache(std::move(fusion)); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } TEST_F(SegmentationTest, ForceFp16Simple) { @@ -356,18 +371,18 @@ TEST_F(SegmentationTest, ForceFp16Simple) { fusion->addOutput(tv5); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{15, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); // Check the segmented edge is fp16 SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); for (SegmentedEdge* edge : segmented_fusion->edges()) { auto* edge_tv = edge->val->as(); EXPECT_EQ(edge_tv->getDataType(), DataType::Half); @@ -406,18 +421,18 @@ TEST_F(SegmentationTest, ForceBf16Simple) { fusion->addOutput(tv5); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{15, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); // Check the segmented edge is bf16 SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); for (SegmentedEdge* edge : segmented_fusion->edges()) { auto* edge_tv = edge->val->as(); EXPECT_EQ(edge_tv->getDataType(), DataType::BFloat16); @@ -452,17 +467,17 @@ TEST_F(SegmentationTest, ForceFp16NotAllCast) { fusion->addOutput(tv7); fusion->addOutput(tv8); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{16, 16, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + 
executor_cache.getMostRecentKernelRuntime()->fusionSegments(); Fusion* complete_fusion = segmented_fusion->completeFusion(); // Check that the edge that wasn't fp16 is the producer of the @@ -513,17 +528,17 @@ TEST_F(SegmentationTest, ForceBf16NotAllCast) { fusion->addOutput(tv7); fusion->addOutput(tv8); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); std::vector shape{16, 16, 16}; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn(shape, options); auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + executor_cache.runFusionWithInputs({in0, in1}); SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); Fusion* complete_fusion = segmented_fusion->completeFusion(); // Check that the edge that wasn't fp16 is the producer of the @@ -558,14 +573,14 @@ TEST_F(SegmentationTest, SliceSegmentCasts) { fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); auto in0 = at::randn({5}, options); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); SegmentedFusion* segmented_fusion = - fec.getMostRecentKernelRuntime()->fusionSegments(); + executor_cache.getMostRecentKernelRuntime()->fusionSegments(); ASSERT_EQ(segmented_fusion->edges().size(), 1); @@ -579,7 +594,7 @@ TEST_F(SegmentationTest, SliceSegmentCasts) { // There should be no cast before the slice EXPECT_TRUE(slice_edge->val->uses().at(0)->isA()); - testValidate(fec.fusion(), outputs, {in0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {in0}, __LINE__, __FILE__); } TEST_F(SegmentationTest, codeGenSupportedMergeIssue1970) { @@ -596,13 +611,13 @@ TEST_F(SegmentationTest, codeGenSupportedMergeIssue1970) { auto* tv3 = segment_set(tv2); fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({3, 4, 3}, options); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); - testValidate(fec.fusion(), outputs, {in0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {in0}, __LINE__, __FILE__); } // Test that Reduction axes are removed in segmentation edges @@ -622,15 +637,16 @@ TEST_F(SegmentationTest, EraseReductionsInSegmentationEdges) { fusion->addOutput(tv3); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({3, 32, 17}, options); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); - testValidate(fec.fusion(), outputs, {in0}, __LINE__, __FILE__); + testValidate(executor_cache.fusion(), outputs, {in0}, __LINE__, __FILE__); - const FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + const FusionKernelRuntime* runtime = + executor_cache.getMostRecentKernelRuntime(); ASSERT_TRUE(runtime != nullptr); SegmentedFusion* segmented_fusion = runtime->fusionSegments(); @@ -662,18 +678,18 @@ TEST_F(SegmentationTest, AliasedOutputOnSegmentation) { auto* tv2 = 
relu(seg_out); fusion->addOutput(tv2); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto in0 = at::randn({2, 3, 4}, options); auto in0_ref = in0.clone(); - auto outputs = fec.runFusionWithInputs({in0}); + auto outputs = executor_cache.runFusionWithInputs({in0}); auto in0_neg = in0_ref.neg(); EXPECT_TRUE(in0_neg.allclose(in0)); testValidate( - fec.fusion(), + executor_cache.fusion(), outputs, {in0.clone()}, {in0_neg.relu()}, @@ -693,14 +709,15 @@ TEST_F(SegmentationTest, MultipleSegmentSetsInOneSegment) { fusion->addInput(in); fusion->addOutput(out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor in_tensor = at::randn({10}, options); - at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0]; + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; - testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__); - FusionKernelRuntime* runtime = fec.getMostRecentKernelRuntime(); + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); EXPECT_THAT(runtime->fusionSegments()->groups(), SizeIs(2)); } @@ -723,10 +740,12 @@ TEST_F(SegmentationTest, ForwardInputsToSegmenterSetIssue2658) { fusion->addOutput(permute_out); fusion->addOutput(compute_out); - FusionExecutorCache fec(std::move(fusion)); + FusionExecutorCache executor_cache(std::move(fusion)); at::Tensor in_tensor = at::randn({2, 3}).cuda(); - std::vector out_tensors = fec.runFusionWithInputs({in_tensor}); - testValidate(fec.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); + std::vector out_tensors = + executor_cache.runFusionWithInputs({in_tensor}); + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); } } // namespace nvfuser diff --git a/tests/cpp/test_serial_gridreduce.cpp b/tests/cpp/test_serial_gridreduce.cpp index 15d035a75d6..13ee5d77df9 100644 --- a/tests/cpp/test_serial_gridreduce.cpp +++ b/tests/cpp/test_serial_gridreduce.cpp @@ -116,15 +116,15 @@ TEST_F(SerialGridReductionTest, Scheduling) { inlineMost(); - FusionExecutor fe; + KernelExecutor ke; if (serial) { tv3->definition()->as()->requestSerialGridReduction(); } - fe.compileFusion(fusion); + ke.compile(fusion); auto input = at::randn( {H, W}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0)); - auto outputs = fe.runFusion({input}); + auto outputs = ke.run({input}); if (serial) { // Verify that zeroed semaphore memory was reused instead of diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp index c813bde5225..6738d99857c 100644 --- a/tests/cpp/test_sharding.cpp +++ b/tests/cpp/test_sharding.cpp @@ -155,9 +155,9 @@ TEST_P(ShardingTest, ComputeIndex) { // Dimension 2 has size 1 because that dimension is DIDx parallelized. 
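[Editor's note: illustrative sketch, not part of this patch.] The comment above reflects the sharding convention at work here: an axis that is DIDx-parallelized across D devices has logical extent D but allocates extent 1 on each rank. A minimal sketch of that shape bookkeeping, with a hypothetical helper name:

    // Hypothetical helper: the per-rank shape keeps extent 1 for the DIDx axis,
    // while the logical shape keeps the full mesh extent D.
    std::vector<int64_t> localShape(std::vector<int64_t> logical, int64_t didx_axis) {
      logical[didx_axis] = 1;  // each device holds a single slice along DIDx
      return logical;
    }
    // localShape({4, 2, D, 5}, /*didx_axis=*/2) == {4, 2, 1, 5}, matching the
    // at::randn({4, 2, 1, 5}, options) input below.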
auto a_tensor = at::randn({4, 2, 1, 5}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get(), {a_tensor}); - auto outputs = fe.runFusion({a_tensor}); + KernelExecutor ke; + ke.compile(fusion.get(), {a_tensor}); + auto outputs = ke.run({a_tensor}); testValidate(fusion.get(), outputs, {a_tensor}, __LINE__, __FILE__); } diff --git a/tests/cpp/test_smem_reuse.cpp b/tests/cpp/test_smem_reuse.cpp index 295fd3c2345..5a258ab6917 100644 --- a/tests/cpp/test_smem_reuse.cpp +++ b/tests/cpp/test_smem_reuse.cpp @@ -556,9 +556,9 @@ TEST_F(SmemReuseTest, SmemReuseWithDifferentVectorizationFactor) { } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n_element}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); } @@ -616,9 +616,9 @@ TEST_F(SmemReuseTest, RegisterReuseWithDifferentVectorizationFactor) { // run the fusion auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({n_element}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); }; @@ -677,9 +677,9 @@ TEST_F(SmemReuseTest, ExpandInterferes) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({y}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto cg_outputs = fe.runFusion({t0}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto cg_outputs = ke.run({t0}); testValidate(fusion.get(), cg_outputs, {t0}, __LINE__, __FILE__); }; diff --git a/tests/cpp/test_swizzle.cpp b/tests/cpp/test_swizzle.cpp index f2cb00546eb..d4e910a2522 100644 --- a/tests/cpp/test_swizzle.cpp +++ b/tests/cpp/test_swizzle.cpp @@ -54,12 +54,12 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle0) { auto str = ir_utils::toString(exprs); NVF_CHECK(str.find("where") != std::string::npos); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -93,12 +93,12 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle1) { // Inlining a producer into a swizzled consumer is ok tv1->computeAt(tv2, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -150,12 +150,12 @@ TEST_F(LegacySwizzleTest, SimpleSwizzle2) { } } - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({32, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -279,12 +279,12 @@ TEST_F(LegacySwizzleTest, LoopSwizzle0) { tv0->computeAt(tv2, -1); - FusionExecutor fe; - 
fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2, 32}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -314,12 +314,12 @@ TEST_F(LegacySwizzleTest, LoopSwizzle1) { tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(1)->parallelize(ParallelType::BIDy); - FusionExecutor fe; - fe.compileFusion(&fusion); + KernelExecutor ke; + ke.compile(&fusion); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({45, 77}, options); - auto cg_outputs = fe.runFusion({t0}); + auto cg_outputs = ke.run({t0}); testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__); } @@ -349,8 +349,8 @@ TEST_F(LegacySwizzleTest, LoopSwizzleCheck0) { tv0->computeAt(tv2, -1); - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + KernelExecutor ke; + ASSERT_ANY_THROW(ke.compile(&fusion)); } // Test assertion in unsupported pattern: half-inlined loop swizzle. @@ -381,8 +381,8 @@ TEST_F(LegacySwizzleTest, LoopSwizzleCheck1) { // Make tv2 swizzled and partially-inlined (unsupported). tv0->computeAt(tv3, -2); - FusionExecutor fe; - ASSERT_ANY_THROW(fe.compileFusion(&fusion)); + KernelExecutor ke; + ASSERT_ANY_THROW(ke.compile(&fusion)); } TEST_F(LegacySwizzleTest, SwizzleVectorize) { @@ -528,8 +528,8 @@ at::Tensor getSwizzledTensor( fusion.addOutput(swizzle.first); fusion.addOutput(swizzle.second); - FusionExecutorCache fec(std::move(fusion_ptr)); - auto outputs = fec.runFusionWithInputs({size_x, size_y}); + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs({size_x, size_y}); return input.index_put({outputs[0], outputs[1]}, input); } @@ -615,9 +615,9 @@ TEST_F(LegacySwizzleTest, SwizzleIndexing170) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t = at::randn({64, 64}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion); + auto outputs = ke.run({t}); testValidate(&fusion, outputs, {t}, __LINE__, __FILE__); } @@ -678,9 +678,9 @@ TEST_F(LegacySwizzleTest, SwizzleInProducerProjection) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t = at::randn({32, 64}, options); - FusionExecutor fe; - fe.compileFusion(fusion.get()); - auto outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(fusion.get()); + auto outputs = ke.run({t}); auto expect = at::empty_like(t); for (auto i : c10::irange(t.size(0) / 8)) { @@ -735,10 +735,10 @@ TEST_F(SwizzleTest, Transpose1) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t = at::randn({10240, 10240}, options); - FusionExecutor fe; - fe.compileFusion(&fusion, {t}); - EXPECT_TRUE(getBankConflictInfo(fe.kernel()).empty()); - std::vector outputs = fe.runFusion({t}); + KernelExecutor ke; + ke.compile(&fusion, {t}); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + std::vector outputs = ke.run({t}); EXPECT_TRUE(at::equal(t.t(), outputs[0])); } diff --git a/tests/cpp/test_tensor_factories.cpp b/tests/cpp/test_tensor_factories.cpp index 8c3b2462ca7..2eabde38b3b 100644 --- a/tests/cpp/test_tensor_factories.cpp +++ b/tests/cpp/test_tensor_factories.cpp @@ -352,9 +352,9 @@ TEST_F(TensorFactoryTest, TensorConstruct) { auto output = tensor(std::vector>{{i00, i01}, {i10, 
diff --git a/tests/cpp/test_tensor_factories.cpp b/tests/cpp/test_tensor_factories.cpp
index 8c3b2462ca7..2eabde38b3b 100644
--- a/tests/cpp/test_tensor_factories.cpp
+++ b/tests/cpp/test_tensor_factories.cpp
@@ -352,9 +352,9 @@ TEST_F(TensorFactoryTest, TensorConstruct) {
   auto output = tensor(std::vector<std::vector<Val*>>{{i00, i01}, {i10, i11}});
   fusion->addOutput(output);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get());
-  auto cg_outputs = fe.runFusion({00, 01, 10, 11});
+  KernelExecutor ke;
+  ke.compile(fusion.get());
+  auto cg_outputs = ke.run({00, 01, 10, 11});
 
   testValidate(fusion.get(), cg_outputs, {00, 01, 10, 11}, __LINE__, __FILE__);
 }
 
@@ -403,9 +403,9 @@ TEST_F(TensorFactoryTest, MetadataAsTensor) {
   auto input0 = at::randn({2, 3, 4, 5}, options);
   auto input1 = at::randn({6, 7, 8, 9}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get());
-  auto cg_outputs = fe.runFusion({input0, input1});
+  KernelExecutor ke;
+  ke.compile(fusion.get());
+  auto cg_outputs = ke.run({input0, input1});
 
   testValidate(fusion.get(), cg_outputs, {input0, input1}, __LINE__, __FILE__);
 }
 
diff --git a/tests/cpp/test_translate_mma.cpp b/tests/cpp/test_translate_mma.cpp
index 420bdf1a760..14e9a9d222f 100644
--- a/tests/cpp/test_translate_mma.cpp
+++ b/tests/cpp/test_translate_mma.cpp
@@ -229,11 +229,11 @@ TEST_P(CombineMulSumAsMmaTestWithLayout, AmpereMulSumToMatmul_Schedule) {
   auto inputs = matmulAtInput2D(M, N, K, layout);
 
-  FusionExecutor fe;
-  fe.compileFusion(
+  KernelExecutor ke;
+  ke.compile(
       &fusion, {inputs.first, inputs.second}, LaunchParams(), matmul_cparams);
-  ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
-  auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
+  ASSERT_TRUE(getBankConflictInfo(ke.kernel()).empty());
+  auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
       inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
   NVF_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
diff --git a/tests/cpp/test_tutorial.cpp b/tests/cpp/test_tutorial.cpp
index 17e93767a1f..943b2fc6504 100644
--- a/tests/cpp/test_tutorial.cpp
+++ b/tests/cpp/test_tutorial.cpp
@@ -82,12 +82,12 @@ TEST_F(Tutorial, Memcpy) {
   std::vector<c10::IValue> aten_inputs = {t0};
 
   // Next, lower the fusion to Kernel, generate CUDA kernel source and then
-  // compile it with nvrtc. All of them are done by FusionExecutor
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
+  // compile it with nvrtc. All of them are done by KernelExecutor
+  KernelExecutor ke;
+  ke.compile(&fusion, aten_inputs);
 
-  // FusionExecutor now has a compiled kernel, which can be executed as:
-  std::vector<at::Tensor> outputs = fe.runFusion(aten_inputs);
+  // KernelExecutor now has a compiled kernel, which can be executed as:
+  std::vector<at::Tensor> outputs = ke.run(aten_inputs);
   // Note that this run is done using just one thread, which will be
   // corrected below.
 
@@ -158,15 +158,15 @@ TEST_F(Tutorial, Memcpy) {
   }
 
   // Since the fusion is modified, we need to recompile it.
-  FusionExecutor fe2;
-  fe2.compileFusion(&fusion, aten_inputs);
+  KernelExecutor ke2;
+  ke2.compile(&fusion, aten_inputs);
 
   // This time, the kernel is launched with multiple threads and
   // thread blocks. Note that the launch configurations, i.e., the
   // thread block and grid shapes, are automatically inferred from the
   // given inputs. To see how many threads are used, run this test
   // with NVFUSER_DUMP=launch_param
-  outputs = fe2.runFusion(aten_inputs);
+  outputs = ke2.run(aten_inputs);
 
   ASSERT_TRUE(outputs[0].equal(t0));
 }
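The Memcpy tutorial above is the clearest summary of the rename: lowering, CUDA code generation, and nvrtc compilation all sit behind KernelExecutor::compile, and kernel launch behind KernelExecutor::run. A condensed before/after sketch (hedged: `fusion` and `aten_inputs` are assumed to be set up as in the tutorial):

    // Before: FusionExecutor fe; fe.compileFusion(&fusion, aten_inputs);
    // After:
    KernelExecutor ke;
    ke.compile(&fusion, aten_inputs);
    // Before: auto outputs = fe.runFusion(aten_inputs);
    // After:
    std::vector<at::Tensor> outputs = ke.run(aten_inputs);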
 
@@ -205,9 +205,9 @@ TEST_F(Tutorial, Reduction) {
   at::Tensor ref = t0.sum({1});
 
   {
-    FusionExecutor fe;
-    fe.compileFusion(&fusion);
-    std::vector<at::Tensor> outputs = fe.runFusion(aten_inputs);
+    KernelExecutor ke;
+    ke.compile(&fusion);
+    std::vector<at::Tensor> outputs = ke.run(aten_inputs);
     testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
   }
 
@@ -221,9 +221,9 @@ TEST_F(Tutorial, Reduction) {
   }
 
   {
-    FusionExecutor fe;
-    fe.compileFusion(&fusion);
-    std::vector<at::Tensor> outputs = fe.runFusion(aten_inputs);
+    KernelExecutor ke;
+    ke.compile(&fusion);
+    std::vector<at::Tensor> outputs = ke.run(aten_inputs);
     testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
   }
 
@@ -239,19 +239,19 @@ TEST_F(Tutorial, Reduction) {
   }
 
   {
-    FusionExecutor fe;
-    fe.compileFusion(&fusion);
+    KernelExecutor ke;
+    ke.compile(&fusion);
 
     // Running this fusion, however, should fail as it would require
     // thread blocks of shape 1024x10, i.e., the same shape as the
     // input tensor, which is too large in CUDA.
     //
     // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-    ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
+    ASSERT_ANY_THROW(ke.run(aten_inputs));
 
     // Try again with a smaller input. This should launch a kernel
     // with thread blocks of shape 32x10
     at::Tensor t1 = at::randn({10, 32}, options);
-    std::vector<at::Tensor> outputs = fe.runFusion({t1});
+    std::vector<at::Tensor> outputs = ke.run({t1});
     testValidate(
         &fusion, outputs, aten_inputs, {t1.sum({1})}, __LINE__, __FILE__);
   }
 
@@ -266,13 +266,13 @@ TEST_F(Tutorial, Reduction) {
   }
 
   {
-    FusionExecutor fe;
-    fe.compileFusion(&fusion);
+    KernelExecutor ke;
+    ke.compile(&fusion);
 
     // The original input should not fail in this case. The kernel
     // will be launched with 10 thread blocks, each of which has 1024
     // threads. Try running this test with NVFUSER_DUMP=launch_param
     // to see the launch configuration of each kernel launch
-    std::vector<at::Tensor> outputs = fe.runFusion(aten_inputs);
+    std::vector<at::Tensor> outputs = ke.run(aten_inputs);
     testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
   }
 }
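The failure the Reduction comments describe follows from a CUDA limit rather than anything in this diff: a thread block may contain at most 1024 threads in total, so a 1024x10 block requests 10240 threads and the launch is rejected, while 32x10 (320 threads) and blocks of 1024x1 fit. A compile-time sanity sketch of that arithmetic (names hypothetical):

    constexpr int kMaxThreadsPerBlock = 1024; // CUDA per-block thread limit
    static_assert(1024 * 10 > kMaxThreadsPerBlock, "1024x10 blocks cannot launch");
    static_assert(32 * 10 <= kMaxThreadsPerBlock, "32x10 blocks are fine");
    static_assert(1024 * 1 <= kMaxThreadsPerBlock, "10 blocks of 1024 threads are fine");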
 
@@ -380,13 +380,13 @@ TEST_F(Tutorial, ReductionRFactor) {
     std::vector<c10::IValue> aten_inputs = {t0};
     at::Tensor ref = t0.sum({0});
 
-    FusionExecutor fe;
-    fe.compileFusion(&fusion_copy);
+    KernelExecutor ke;
+    ke.compile(&fusion_copy);
 
     // Since the size of the input is 10000, which is split by a
     // factor of 1024, the first per-thread reduction is done for
     // ceilDiv(10000, 1024) = 10 elements.
-    std::vector<at::Tensor> outputs = fe.runFusion(aten_inputs);
+    std::vector<at::Tensor> outputs = ke.run(aten_inputs);
     testValidate(&fusion_copy, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
   }
 
@@ -439,10 +439,10 @@ TEST_F(Tutorial, ReductionRFactor) {
     std::vector<c10::IValue> aten_inputs = {t0};
     at::Tensor ref = t0.sum({0});
 
-    FusionExecutor fe;
-    fe.compileFusion(&fusion_copy);
+    KernelExecutor ke;
+    ke.compile(&fusion_copy);
 
-    std::vector<at::Tensor> outputs = fe.runFusion(aten_inputs);
+    std::vector<at::Tensor> outputs = ke.run(aten_inputs);
     testValidate(&fusion_copy, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
   }
 }
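For readers tracing the ceilDiv(10000, 1024) = 10 arithmetic in the ReductionRFactor comment: ceilDiv is ordinary integer ceiling division. A hedged sketch (the helper exists in nvFuser; this implementation is illustrative, not the library's):

    constexpr int64_t ceilDiv(int64_t a, int64_t b) {
      return (a + b - 1) / b; // smallest integer >= a / b, for positive a, b
    }
    static_assert(ceilDiv(10000, 1024) == 10); // 10000 / 1024 = 9.76..., rounds up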
 
@@ -786,9 +786,9 @@ TEST_F(Tutorial, BasicTMA) {
     auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
     std::vector<int64_t> shape(3, 300);
     auto t = at::randn(shape, options);
 
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t}, {}, index32bit);
-    std::vector<at::Tensor> outputs = fe.runFusion({t});
+    KernelExecutor ke;
+    ke.compile(&fusion, {t}, {}, index32bit);
+    std::vector<at::Tensor> outputs = ke.run({t});
     ASSERT_TRUE(at::equal(t, outputs[0]));
   }
 
@@ -870,9 +870,9 @@ TEST_F(Tutorial, BasicTMA) {
     auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
     std::vector<int64_t> shape(3, 300);
     auto t = at::randn(shape, options);
 
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t}, {}, index32bit);
-    std::vector<at::Tensor> outputs = fe.runFusion({t});
+    KernelExecutor ke;
+    ke.compile(&fusion, {t}, {}, index32bit);
+    std::vector<at::Tensor> outputs = ke.run({t});
     ASSERT_TRUE(at::equal(t, outputs[0]));
   }
 
@@ -953,9 +953,9 @@ TEST_F(Tutorial, BasicTMA) {
     auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
     std::vector<int64_t> shape(3, 300);
    auto t = at::randn(shape, options);
 
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t}, {}, index32bit);
-    std::vector<at::Tensor> outputs = fe.runFusion({t});
+    KernelExecutor ke;
+    ke.compile(&fusion, {t}, {}, index32bit);
+    std::vector<at::Tensor> outputs = ke.run({t});
     ASSERT_TRUE(at::equal(t, outputs[0]));
   }
 
@@ -1033,9 +1033,9 @@ TEST_F(Tutorial, BasicTMA) {
     auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
     std::vector<int64_t> shape(3, 300);
     auto t = at::randn(shape, options);
 
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t}, {}, index32bit);
-    std::vector<at::Tensor> outputs = fe.runFusion({t});
+    KernelExecutor ke;
+    ke.compile(&fusion, {t}, {}, index32bit);
+    std::vector<at::Tensor> outputs = ke.run({t});
     ASSERT_TRUE(at::equal(t, outputs[0]));
   }
 
@@ -1138,9 +1138,9 @@ TEST_F(Tutorial, BasicTMA) {
     auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
     std::vector<int64_t> shape(3, 300);
     auto t = at::randn(shape, options);
 
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t}, {}, index32bit);
-    std::vector<at::Tensor> outputs = fe.runFusion({t});
+    KernelExecutor ke;
+    ke.compile(&fusion, {t}, {}, index32bit);
+    std::vector<at::Tensor> outputs = ke.run({t});
     ASSERT_TRUE(at::equal(t, outputs[0]));
   }
 
@@ -1244,9 +1244,9 @@ TEST_F(Tutorial, BasicTMA) {
     auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
     std::vector<int64_t> shape(3, 300);
     auto t = at::randn(shape, options);
 
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t}, {}, index32bit);
-    std::vector<at::Tensor> outputs = fe.runFusion({t});
+    KernelExecutor ke;
+    ke.compile(&fusion, {t}, {}, index32bit);
+    std::vector<at::Tensor> outputs = ke.run({t});
     ASSERT_TRUE(at::equal(t, outputs[0]));
   }
 }
 
@@ -1343,10 +1343,10 @@ TEST_F(Tutorial, VectorizeStorePointwiseTMA) {
   at::Tensor at_tv0 = at::randn({dim0, dim1}, options);
   at::Tensor at_tv1 = at::randn({dim0, dim1}, options);
 
-  // Compile with FusionExecutor directly to avoid scheduling
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit);
-  auto outputs = fe.runFusion({at_tv0, at_tv1});
+  // Compile with KernelExecutor directly to avoid scheduling
+  KernelExecutor ke;
+  ke.compile(fusion.get(), {at_tv0, at_tv1}, {}, index32bit);
+  auto outputs = ke.run({at_tv0, at_tv1});
 
   auto at_output = at_tv0 + at_tv1;
   testValidate(
@@ -1447,10 +1447,10 @@ TEST_F(Tutorial, PointwiseBroadcastTMA) {
   at::Tensor at_tv0 = at::randn({dim1, dim2, dim3}, options);
   at::Tensor at_tv1 = at::randn({dim0, dim1, dim2, dim3}, options);
 
-  // Compile with FusionExecutor directly to avoid scheduling
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {at_tv0, at_tv1}, {}, index32bit);
-  auto outputs = fe.runFusion({at_tv0, at_tv1});
+  // Compile with KernelExecutor directly to avoid scheduling
+  KernelExecutor ke;
+  ke.compile(fusion.get(), {at_tv0, at_tv1}, {}, index32bit);
+  auto outputs = ke.run({at_tv0, at_tv1});
 
   auto at_output = at_tv0 + at_tv1;
   testValidate(
@@ -1551,10 +1551,10 @@ TEST_F(Tutorial, TMABankConflictFreeTranspose) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   auto t = at::randn({10000, 10000}, options);
 
-  FusionExecutor fe;
+  KernelExecutor ke;
   CompileParams index32bit{DataType::Int32, 255, false};
-  fe.compileFusion(&fusion, {t}, {}, index32bit);
-  std::vector<at::Tensor> outputs = fe.runFusion({t});
+  ke.compile(&fusion, {t}, {}, index32bit);
+  std::vector<at::Tensor> outputs = ke.run({t});
   ASSERT_TRUE(at::equal(t.t(), outputs[0]));
 }
diff --git a/tests/cpp/test_unary.cpp b/tests/cpp/test_unary.cpp
index 9455230683b..76dfd789b39 100644
--- a/tests/cpp/test_unary.cpp
+++ b/tests/cpp/test_unary.cpp
@@ -57,13 +57,18 @@ TEST_P(UnaryTest, Neg) {
     in_tensor = at::randn(shape, options);
   }
 
-  FusionExecutorCache fec(std::move(fusion));
-  auto out_tensors = fec.runFusionWithInputs({in_tensor});
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
 
   // Calculate the reference output explicitly. Type promotion happens when
   // building the fusion, e.g., inside `neg`. Relying on ExpressionEvaluator to
   // verify the result would hide type promotion errors.
   testValidate(
-      fec.fusion(), out_tensors, {in_tensor}, {-in_tensor}, __LINE__, __FILE__);
+      executor_cache.fusion(),
+      out_tensors,
+      {in_tensor},
+      {-in_tensor},
+      __LINE__,
+      __FILE__);
 }
 
 namespace {
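One concrete line of illustration for the comment in that test_unary.cpp hunk (hedged, not part of the diff): the reference is computed with ATen precisely so it does not pass through the fusion's own type-promotion logic.

    // Independent ATen reference: if `neg` inside the fusion promoted the
    // dtype incorrectly, this comparison would catch it, whereas replaying
    // the fusion definition (e.g. via ExpressionEvaluator) would repeat the bug.
    at::Tensor ref = -in_tensor;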
diff --git a/tests/cpp/test_utils.cpp b/tests/cpp/test_utils.cpp
index 5215e58f19d..e04fd39ca43 100644
--- a/tests/cpp/test_utils.cpp
+++ b/tests/cpp/test_utils.cpp
@@ -1115,16 +1115,16 @@ TEST_F(NVFuserTest, FusionSASSDumpError) {
   at::Tensor t0 = at::randn({8}, options);
 
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
+  KernelExecutor ke;
+  ke.compile(&fusion, {t0});
 
   EXPECT_THAT(
-      [&]() { fe.disassembledKernelSASS(); },
+      [&]() { ke.disassembledKernelSASS(); },
       ::testing::ThrowsMessage(
           ::testing::HasSubstr("I am fake")));
 
-  auto cg_outputs = fe.runFusion({t0});
-  testValidate(fe.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
+  auto cg_outputs = ke.run({t0});
+  testValidate(ke.kernel(), cg_outputs, {t0}, __LINE__, __FILE__);
 }
 
 TEST_F(NVFuserTest, ProveLinearAndGetStride) {
diff --git a/tests/cpp/utils.cpp b/tests/cpp/utils.cpp
index ca29c5fe1bf..64b1cbe55f7 100644
--- a/tests/cpp/utils.cpp
+++ b/tests/cpp/utils.cpp
@@ -24,15 +24,13 @@ CGResultsPackage scheduleAndRun(
     bool validate_scheduler) {
   auto heuristic_params = SchedulerEntry::scheduleWith(
       fusion, scheduler_type, runtime_inputs, validate_scheduler);
-  auto fusion_executor = std::make_unique<FusionExecutor>();
-  fusion_executor->compileFusion(
-      fusion, runtime_inputs, heuristic_params->lparams);
-  auto cg_outputs =
-      fusion_executor->runFusion(runtime_inputs, heuristic_params->lparams);
+  auto ke = std::make_unique<KernelExecutor>();
+  ke->compile(fusion, runtime_inputs, heuristic_params->lparams);
+  auto cg_outputs = ke->run(runtime_inputs, heuristic_params->lparams);
   CGResultsPackage results = {
       .outputs = cg_outputs,
       .heuristic_params = std::move(heuristic_params),
-      .fusion_executor = std::move(fusion_executor)};
+      .kernel_executor = std::move(ke)};
   return results;
 }
diff --git a/tests/cpp/utils.h b/tests/cpp/utils.h
index 648b85dbe55..d2964c8b731 100644
--- a/tests/cpp/utils.h
+++ b/tests/cpp/utils.h
@@ -40,12 +40,12 @@ namespace nvfuser {
 struct CGResultsPackage {
   std::vector<at::Tensor> outputs;
   std::unique_ptr<HeuristicParams> heuristic_params;
-  std::unique_ptr<FusionExecutor> fusion_executor;
+  std::unique_ptr<KernelExecutor> kernel_executor;
 };
 
 // Grabs heuristics and schedules with the provided scheduler type, compiles and
 // runs with Fusion executor, returns a struct containing the outputs,
-// heuristic_params, and FusionExecutor. These structures are for convenience in
+// heuristic_params, and KernelExecutor. These structures are for convenience in
 // testing. If validate_scheduler is set to false the scheduler check will still
 // be run but it will be ignored. Otherwise canScheduler returning false will
 // throw.
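Since scheduleAndRun is the helper most tests reach for, a hedged sketch of a call site after the field rename (the scheduler type and inputs are placeholders, not taken from these files):

    // Only the .kernel_executor field name is new here.
    auto results = scheduleAndRun(&fusion, SchedulerType::PointWise, {t0});
    KernelExecutor* ke = results.kernel_executor.get();
    // e.g. inspect the generated kernel:
    // EXPECT_TRUE(getBankConflictInfo(ke->kernel()).empty());
    auto outputs = results.outputs;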
diff --git a/tests/python/utils.py b/tests/python/utils.py
index 2a7fadc4a14..9ff3f1b8d78 100644
--- a/tests/python/utils.py
+++ b/tests/python/utils.py
@@ -437,8 +437,10 @@ def exec_nvfuser(
             self.assertTrue(
                 check_captured_python_definition(out, fd, inputs_captured, device)
             )
-
-        self.assertEqual(fc.num_fusions() - before_fusions, int(new_fusion_expected))
+        if not disable_serde:
+            self.assertEqual(
+                fc.num_fusions() - before_fusions, int(new_fusion_expected)
+            )
 
         if is_clonable:
             self.assertTrue(check_cpp_translation(out, fd, inputs_cloned))
diff --git a/tools/examples/repro.cpp b/tools/examples/repro.cpp
index a1e123dd3aa..53058ace4ab 100644
--- a/tools/examples/repro.cpp
+++ b/tools/examples/repro.cpp
@@ -103,7 +103,7 @@ TEST_F(NVFuserTest, FusionGeneratedTest_CUDA) {
     outputs.push_back(t32);
   }
 
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs(inputs);
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs(inputs);
 
   testValidate(fusion, cg_outputs, inputs, outputs, __LINE__, __FILE__);
 }