diff --git a/benchmarks/python/conftest.py b/benchmarks/python/conftest.py
index 03adbe1e7dd..8d7812288e3 100644
--- a/benchmarks/python/conftest.py
+++ b/benchmarks/python/conftest.py
@@ -4,7 +4,7 @@
 import pytest
 
 from .core import BENCHMARK_CONFIG
 from nvfuser.pytorch_utils import DEVICE_PROPERTIES
-
+from .global_params import DEFAULT_EXECUTORS
 
 def pytest_addoption(parser):
     parser.addoption(
@@ -104,20 +104,18 @@ def pytest_collection_modifyitems(session, config, items):
     from nvfuser.pytorch_utils import retry_on_oom_or_skip_test
 
-    executors = ["eager", "torchcompile", "thunder"]
-
     def get_test_executor(item) -> str | None:
         if hasattr(item, "callspec") and "executor" in item.callspec.params:
             test_executor = item.callspec.params["executor"]
             assert (
-                test_executor in executors
+                test_executor in DEFAULT_EXECUTORS
             ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}."
             return test_executor
         return None
 
     executors_to_skip = []
-    for executor in executors:
+    for executor in DEFAULT_EXECUTORS:
         if not config.getoption(f"--benchmark-{executor}"):
             executors_to_skip.append(executor)
diff --git a/benchmarks/python/core.py b/benchmarks/python/core.py
index ab15dd11b66..86139f67a38 100644
--- a/benchmarks/python/core.py
+++ b/benchmarks/python/core.py
@@ -12,7 +12,7 @@
 import warnings
 import thunder
 from thunder.executors.nvfuserex import nvfuserex
-
+from .global_params import DEFAULT_EXECUTORS
 
 # These variables can be overwritten through CLI commands
 # --benchmark-rounds=rounds --benchmark-warmup-rounds=warmup_rounds
@@ -47,7 +47,7 @@
 def unary_bwd_torch(inputs: List):  # [output, grad_out]
     inputs[0].backward(inputs[1], retain_graph=True)
 
 
 def with_executor(executor: str, fwd_fn: Callable) -> Callable:
-    assert executor in ["eager", "torchcompile", "thunder"]
+    assert executor in DEFAULT_EXECUTORS
     if executor == 'eager':
         return fwd_fn
     if executor == 'torchcompile':
@@ -325,6 +325,9 @@ def run_benchmark(
     def setup():
         clear_l2_cache()
         if device == "cuda":
+            for inp in inputs:
+                if isinstance(inp, torch.Tensor):
+                    inp.grad = None
             return [inputs], {}
 
     # Device = 'host'
diff --git a/benchmarks/python/global_params.py b/benchmarks/python/global_params.py
index 2d52d4d96d2..df2f3da1d56 100644
--- a/benchmarks/python/global_params.py
+++ b/benchmarks/python/global_params.py
@@ -26,6 +26,8 @@
 # Datatypes that will be promoted to Datatype.Float in Fusion Definitions
 PROMOTE_DTYPES = [DataType.BFloat16, DataType.Half]
 
+# Default executors
+DEFAULT_EXECUTORS = ["eager", "torchcompile", "thunder"]
 
 # Model Parameters from LLMs (GPT2/3, PaLM, LLama)
 # Embedding size: d_model, d_ff = 4 * d_model
diff --git a/benchmarks/python/normalization.py b/benchmarks/python/normalization.py
index a4f72242f4f..0da1e95ffe8 100644
--- a/benchmarks/python/normalization.py
+++ b/benchmarks/python/normalization.py
@@ -501,6 +501,6 @@ def norm_bwd_baseline_benchmark(
     run_benchmark(
         benchmark,
         unary_bwd_torch,
-        [outputs, grads],
+        [outputs, grads, *fwd_inputs],
         iobytes=norm_bwd_iobytes(size, dtype, norm),
     )
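A note on the two behavioral changes above (the rest of this patch is the mechanical `DEFAULT_EXECUTORS` swap): `run_benchmark`'s `setup()` now resets `.grad` on every tensor input before each round, and `norm_bwd_baseline_benchmark` forwards `*fwd_inputs` so those tensors are actually visible to `setup()`. A minimal standalone sketch, not part of the patch and with illustrative names, of why the reset matters for backward benchmarks:

```python
# Standalone sketch: .grad accumulates across backward() calls, so timing a
# backward pass without resetting it also times ever-growing accumulation.
import torch

x = torch.ones(3, requires_grad=True)
for round_idx in range(3):
    # x.grad = None  # <- the equivalent of what the new setup() does per round
    y = (x * 2).sum()
    y.backward()
    print(round_idx, x.grad[0].item())  # 2.0, 4.0, 6.0 without the reset
```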
diff --git a/benchmarks/python/test_batchnorm_bwd.py b/benchmarks/python/test_batchnorm_bwd.py
index 0a1cd64cc57..3fb83e91276 100644
--- a/benchmarks/python/test_batchnorm_bwd.py
+++ b/benchmarks/python/test_batchnorm_bwd.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 import pytest
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, DEFAULT_EXECUTORS
 from .normalization import norm_bwd_nvf_benchmark, norm_bwd_baseline_benchmark
@@ -31,7 +31,7 @@ def test_batchnorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
diff --git a/benchmarks/python/test_batchnorm_fwd.py b/benchmarks/python/test_batchnorm_fwd.py
index af197ce6f1b..921e42bc6d1 100644
--- a/benchmarks/python/test_batchnorm_fwd.py
+++ b/benchmarks/python/test_batchnorm_fwd.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 import pytest
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, DEFAULT_EXECUTORS
 from .normalization import norm_fwd_nvf_benchmark, norm_fwd_baseline_benchmark
@@ -31,7 +31,7 @@ def test_batchnorm_fwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("channels_last", [True, False])
diff --git a/benchmarks/python/test_broadcast_add_fwd.py b/benchmarks/python/test_broadcast_add_fwd.py
index dcedb5a0ded..b1073b9537a 100644
--- a/benchmarks/python/test_broadcast_add_fwd.py
+++ b/benchmarks/python/test_broadcast_add_fwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 
 
 def bcast_add_fusion(
@@ -88,7 +88,7 @@ def test_bcast_add_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, x])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("bcast_axis", [0, 1], ids=["outer", "inner"])
diff --git a/benchmarks/python/test_dropout_layernorm_bwd.py b/benchmarks/python/test_dropout_layernorm_bwd.py
index 476381366f4..b091213faa5 100644
--- a/benchmarks/python/test_dropout_layernorm_bwd.py
+++ b/benchmarks/python/test_dropout_layernorm_bwd.py
@@ -12,7 +12,7 @@
     with_executor
 )
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import dropout_layernorm
 
 
 def dropout_layernorm_bwd_fusion(
@@ -190,7 +190,7 @@ def test_dropout_layernorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_bwd_baseline_benchmark(
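Since every baseline test now draws its executor axis from the same constant, adding a new executor to `DEFAULT_EXECUTORS` widens all of these parametrizations (and the conftest filtering) at once. A minimal, self-contained sketch of the pattern; the test name is hypothetical, and the real tests also parametrize size and dtype:

```python
# Sketch of the shared-constant parametrization pattern used by the files below.
import pytest

DEFAULT_EXECUTORS = ["eager", "torchcompile", "thunder"]  # mirrors global_params.py

@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
def test_example_baseline_benchmark(executor):
    # One test variant is collected per executor; pytest_collection_modifyitems
    # in conftest.py then skips any executor whose --benchmark-<executor> flag
    # was not passed on the command line.
    assert executor in DEFAULT_EXECUTORS
```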
diff --git a/benchmarks/python/test_dropout_layernorm_fwd.py b/benchmarks/python/test_dropout_layernorm_fwd.py
index c0e8a3e0e68..bfdff4b1545 100644
--- a/benchmarks/python/test_dropout_layernorm_fwd.py
+++ b/benchmarks/python/test_dropout_layernorm_fwd.py
@@ -11,7 +11,7 @@
     with_executor
 )
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import dropout_layernorm
 
 
 def dropout_layernorm_fwd_fusion(
@@ -150,7 +150,7 @@ def test_dropout_layernorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_layernorm_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_dropout_rmsnorm_bwd.py b/benchmarks/python/test_dropout_rmsnorm_bwd.py
index e7267764463..c51da11f107 100644
--- a/benchmarks/python/test_dropout_rmsnorm_bwd.py
+++ b/benchmarks/python/test_dropout_rmsnorm_bwd.py
@@ -12,7 +12,7 @@
     with_executor
 )
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import dropout_rmsnorm
 
 
 def dropout_rmsnorm_bwd_fusion(
@@ -170,7 +170,7 @@ def test_dropout_rmsnorm_bwd_nvf_benchmark(
     )
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_bwd_baseline_benchmark(
diff --git a/benchmarks/python/test_dropout_rmsnorm_fwd.py b/benchmarks/python/test_dropout_rmsnorm_fwd.py
index 06e7629223e..52cb57a0982 100644
--- a/benchmarks/python/test_dropout_rmsnorm_fwd.py
+++ b/benchmarks/python/test_dropout_rmsnorm_fwd.py
@@ -11,7 +11,7 @@
     with_executor
 )
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import dropout_rmsnorm
 
 
 def dropout_rmsnorm_fwd_fusion(
@@ -140,7 +140,7 @@ def test_dropout_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2, weights])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_dropout_rmsnorm_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_gelu_bwd.py b/benchmarks/python/test_gelu_bwd.py
index 3effbf92f74..f3cbdbfd56b 100644
--- a/benchmarks/python/test_gelu_bwd.py
+++ b/benchmarks/python/test_gelu_bwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import gelu
@@ -88,7 +88,7 @@ def test_gelu_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, grads, bias])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_bwd_baseline_benchmark(
diff --git a/benchmarks/python/test_gelu_bwd_reduction.py b/benchmarks/python/test_gelu_bwd_reduction.py
index 7ade02c1f04..790ce1c31c2 100644
--- a/benchmarks/python/test_gelu_bwd_reduction.py
+++ b/benchmarks/python/test_gelu_bwd_reduction.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
diff --git a/benchmarks/python/test_gelu_fwd.py b/benchmarks/python/test_gelu_fwd.py
index 3d1201049e6..fcd5f571d0f 100644
--- a/benchmarks/python/test_gelu_fwd.py
+++ b/benchmarks/python/test_gelu_fwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import gelu
@@ -63,7 +63,7 @@ def test_gelu_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_gelu_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_groupnorm_fwd.py b/benchmarks/python/test_groupnorm_fwd.py
index e9151767149..a7d66754356 100644
--- a/benchmarks/python/test_groupnorm_fwd.py
+++ b/benchmarks/python/test_groupnorm_fwd.py
@@ -8,7 +8,7 @@
 import torch
 import thunder
 from thunder.executors.nvfuserex import nvfuserex
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 
 
 def get_n_groups(C):
@@ -128,7 +128,7 @@ def test_groupnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, weight, bias])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile", "thunder"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=4))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_groupnorm_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_huggingface_attn_bwd.py b/benchmarks/python/test_huggingface_attn_bwd.py
index 196c00a5fae..7fb7f8a4e06 100644
--- a/benchmarks/python/test_huggingface_attn_bwd.py
+++ b/benchmarks/python/test_huggingface_attn_bwd.py
@@ -107,7 +107,7 @@ def test_huggingface_attn_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_attn_inputs())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_huggingface_attn_bwd_baseline_benchmark(
@pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_huggingface_attn_fwd_baseline_benchmark( diff --git a/benchmarks/python/test_layernorm_bwd.py b/benchmarks/python/test_layernorm_bwd.py index 27e4c5a4b9f..4d4e033f153 100644 --- a/benchmarks/python/test_layernorm_bwd.py +++ b/benchmarks/python/test_layernorm_bwd.py @@ -6,7 +6,7 @@ from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor import torch -from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES +from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS import numpy as np from .torch_ops import layernorm @@ -147,7 +147,7 @@ def test_layernorm_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, grads, mean, invstd, weights]) -@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) +@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_layernorm_bwd_baseline_benchmark( diff --git a/benchmarks/python/test_layernorm_fwd.py b/benchmarks/python/test_layernorm_fwd.py index 4f9fd112a56..64eadfe5890 100644 --- a/benchmarks/python/test_layernorm_fwd.py +++ b/benchmarks/python/test_layernorm_fwd.py @@ -6,7 +6,7 @@ from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype from .core import run_benchmark, clear_dynamo_cache, with_executor import torch -from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES +from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS import numpy as np from .torch_ops import layernorm @@ -97,7 +97,7 @@ def test_layernorm_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, inputs) -@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) +@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS) @pytest.mark.parametrize("size", generate_input_sizes(dims=2)) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_layernorm_fwd_baseline_benchmark( diff --git a/benchmarks/python/test_nanogpt_attn_bwd.py b/benchmarks/python/test_nanogpt_attn_bwd.py index f7cc5593667..356966ef34e 100644 --- a/benchmarks/python/test_nanogpt_attn_bwd.py +++ b/benchmarks/python/test_nanogpt_attn_bwd.py @@ -124,7 +124,7 @@ def test_nanogpt_attn_bwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [grads, attn, dropout_mask, bias_mask]) -@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) +@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_nanogpt_attn_bwd_baseline_benchmark( diff --git a/benchmarks/python/test_nanogpt_attn_fwd.py b/benchmarks/python/test_nanogpt_attn_fwd.py index f8f4f69aef1..06b74603c0b 100644 --- a/benchmarks/python/test_nanogpt_attn_fwd.py +++ b/benchmarks/python/test_nanogpt_attn_fwd.py @@ -126,7 +126,7 @@ def test_nanogpt_attn_fwd_nvf_benchmark( run_benchmark(benchmark, fd.execute, [inputs, bias]) -@pytest.mark.parametrize("executor", ["eager", "torchcompile"]) +@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS) @pytest.mark.parametrize("size", generate_attn_inputs()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) def test_nanogpt_attn_fwd_baseline_benchmark( diff --git a/benchmarks/python/test_pointwise_mul.py b/benchmarks/python/test_pointwise_mul.py 
diff --git a/benchmarks/python/test_pointwise_mul.py b/benchmarks/python/test_pointwise_mul.py
index 2689f73ec7e..4a3372d9eac 100644
--- a/benchmarks/python/test_pointwise_mul.py
+++ b/benchmarks/python/test_pointwise_mul.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 
 
 def pointwise_mul_fusion(
@@ -50,7 +50,7 @@ def test_pointwise_mul_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_pointwise_mul_baseline_benchmark(
diff --git a/benchmarks/python/test_reduction.py b/benchmarks/python/test_reduction.py
index 4b8596c2f13..2ee667a5b2c 100644
--- a/benchmarks/python/test_reduction.py
+++ b/benchmarks/python/test_reduction.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 
 
 def reduction_fusion(
@@ -53,7 +53,7 @@ def test_reduction_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
diff --git a/benchmarks/python/test_reduction_epilogue.py b/benchmarks/python/test_reduction_epilogue.py
index ce5e1961d63..30f1a331f23 100644
--- a/benchmarks/python/test_reduction_epilogue.py
+++ b/benchmarks/python/test_reduction_epilogue.py
@@ -7,7 +7,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 
 # test the influence of epilogue on the performance of reduction.
 # current reduction scheduler only allows epilogue to be fused with outer reduction without post reduction broadcast.
@@ -67,7 +67,7 @@ def test_reduction_epilogue_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [x, epilogue])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0])
diff --git a/benchmarks/python/test_rmsnorm_bwd.py b/benchmarks/python/test_rmsnorm_bwd.py
index af9b1836b85..ab885c00803 100644
--- a/benchmarks/python/test_rmsnorm_bwd.py
+++ b/benchmarks/python/test_rmsnorm_bwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import rmsnorm
@@ -112,7 +112,7 @@ def test_rmsnorm_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, rms_eps, grads, weights])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_bwd_baseline_benchmark(
diff --git a/benchmarks/python/test_rmsnorm_fwd.py b/benchmarks/python/test_rmsnorm_fwd.py
index 63469fa7995..61e8b81467d 100644
--- a/benchmarks/python/test_rmsnorm_fwd.py
+++ b/benchmarks/python/test_rmsnorm_fwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import rmsnorm
@@ -80,7 +80,7 @@ def test_rmsnorm_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [inputs, weights])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_rmsnorm_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_scale_bias_relu_bwd.py b/benchmarks/python/test_scale_bias_relu_bwd.py
index d057323bf64..a3656cc9f3c 100644
--- a/benchmarks/python/test_scale_bias_relu_bwd.py
+++ b/benchmarks/python/test_scale_bias_relu_bwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import scale_bias_relu
@@ -79,7 +79,7 @@ def test_sbr_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [scale, bool_mask, grads])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_bwd_baseline_benchmark(
diff --git a/benchmarks/python/test_scale_bias_relu_fwd.py b/benchmarks/python/test_scale_bias_relu_fwd.py
index ab77adb7f78..70e07c13165 100644
--- a/benchmarks/python/test_scale_bias_relu_fwd.py
+++ b/benchmarks/python/test_scale_bias_relu_fwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import scale_bias_relu
@@ -78,7 +78,7 @@ def test_sbr_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [bias, scale, inputs])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_sbr_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_silu_mul_bwd.py b/benchmarks/python/test_silu_mul_bwd.py
index ff06670e643..76738319dc4 100644
--- a/benchmarks/python/test_silu_mul_bwd.py
+++ b/benchmarks/python/test_silu_mul_bwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import silu_mul
@@ -79,7 +79,7 @@ def test_silu_mul_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [grads, x, y])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_bwd_baseline_benchmark(
diff --git a/benchmarks/python/test_silu_mul_fwd.py b/benchmarks/python/test_silu_mul_fwd.py
index b0e520ca069..fe7b3719b72 100644
--- a/benchmarks/python/test_silu_mul_fwd.py
+++ b/benchmarks/python/test_silu_mul_fwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 from .torch_ops import silu_mul
 
 
 def silu_mul_fwd_fusion(fd: FusionDefinition, dtype: DataType):
@@ -56,7 +56,7 @@ def test_silu_mul_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 def test_silu_mul_fwd_baseline_benchmark(
diff --git a/benchmarks/python/test_softmax_bwd.py b/benchmarks/python/test_softmax_bwd.py
index e1d9f4465f5..4c281b63c45 100644
--- a/benchmarks/python/test_softmax_bwd.py
+++ b/benchmarks/python/test_softmax_bwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, unary_bwd_torch, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import softmax
@@ -91,7 +91,7 @@ def test_softmax_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
diff --git a/benchmarks/python/test_softmax_fwd.py b/benchmarks/python/test_softmax_fwd.py
index 273bd165957..93dce237e25 100644
--- a/benchmarks/python/test_softmax_fwd.py
+++ b/benchmarks/python/test_softmax_fwd.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype
 from .core import run_benchmark, clear_dynamo_cache, with_executor
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 import numpy as np
 from .torch_ops import softmax
@@ -77,7 +77,7 @@ def test_softmax_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index f8d10cb9932..251409934cf 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -6,7 +6,7 @@
 from nvfuser.pytorch_utils import torch_dtype_to_nvfuser_dtype, with_executor
 from .core import run_benchmark, clear_dynamo_cache
 import torch
-from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES
+from .global_params import generate_input_sizes, FLOAT_DTYPES, PROMOTE_DTYPES, DEFAULT_EXECUTORS
 
 
 def transpose_fusion(
@@ -74,7 +74,7 @@ def test_transpose_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, [input1, input2])
 
 
-@pytest.mark.parametrize("executor", ["eager", "torchcompile"])
+@pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
 @pytest.mark.parametrize("size", generate_input_sizes(dims=3))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)])
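For anyone running these locally: `pytest_collection_modifyitems` in conftest.py skips any executor variant whose `--benchmark-{executor}` flag is absent, so the newly added thunder variants only run when requested, e.g. `pytest benchmarks/python/test_softmax_fwd.py --benchmark-thunder`. One thing to double-check in review: the attention test files (test_huggingface_attn_*.py, test_nanogpt_attn_*.py) now reference `DEFAULT_EXECUTORS` in their parametrize decorators, but no corresponding import hunk appears in the diff above; unless those files already import it, they would fail with a NameError at collection time.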