diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8a05c3aaa26..11e58cd776d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,7 @@ env: jobs: clang-build: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 with: @@ -37,7 +37,7 @@ jobs: python setup.py build dynamic-type-meson: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 with: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 47e30615be0..f20b1a88ae1 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,7 +15,7 @@ env: jobs: check-license: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 with: @@ -28,7 +28,7 @@ jobs: test ! -s missing-header-files.txt clang-tidy: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 with: @@ -72,7 +72,7 @@ jobs: git --no-pager diff --diff-filter=d --name-only $head_commit | grep -e "csrc/.*\.cpp" -e "csrc/.*\.h" | xargs lintrunner --take CLANGTIDY --force-color lintrunner: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 with: diff --git a/.github/workflows/nvfuser-ci-trigger.yml b/.github/workflows/nvfuser-ci-trigger.yml index d4a79b95a9c..1175a89b640 100644 --- a/.github/workflows/nvfuser-ci-trigger.yml +++ b/.github/workflows/nvfuser-ci-trigger.yml @@ -15,9 +15,34 @@ jobs: args: ${{ env.args }} # This job only runs for pull request comments - if: | - ( startsWith(github.event.comment.body, '!build') || startsWith(github.event.comment.body, '!test') ) && - (github.actor == 'xwang233' || github.actor == 'jjsjann123' || github.actor == 'chang-l' || github.actor == 'csarofeen' || github.actor == 'drzejan2' || github.actor == 'IvanYashchuk' || github.actor == 'jacobhinkle' || github.actor == 'kevinstephano' || github.actor == 'liqiangxl' || github.actor == 'mmigdal-nv' || github.actor == 'naoyam' || github.actor == 'ptrblck' || github.actor == 'rdspring1' || github.actor == 'samnordmann' || github.actor == 'zasdfgbnm' || github.actor == 'crcrpar' || github.actor == 'nWEIdia' || github.actor == 'Priya2698' || github.actor == 'wujingyue' || github.actor == 'tfogal' || github.actor == 'protonu' || github.actor == 'cowanmeg' || github.actor == 'nsarka') + if: >- + ( startsWith(github.event.comment.body, '!build') || + startsWith(github.event.comment.body, '!test') + ) && + ( github.actor == 'xwang233' || + github.actor == 'jjsjann123' || + github.actor == 'chang-l' || + github.actor == 'csarofeen' || + github.actor == 'drzejan2' || + github.actor == 'IvanYashchuk' || + github.actor == 'jacobhinkle' || + github.actor == 'kevinstephano' || + github.actor == 'liqiangxl' || + github.actor == 'mmigdal-nv' || + github.actor == 'naoyam' || + github.actor == 'ptrblck' || + github.actor == 'rdspring1' || + github.actor == 'samnordmann' || + github.actor == 'zasdfgbnm' || + github.actor == 'crcrpar' || + github.actor == 'nWEIdia' || + github.actor == 'Priya2698' || + github.actor == 'wujingyue' || + github.actor == 'tfogal' || + github.actor == 'protonu' || + github.actor == 'cowanmeg' || + github.actor == 'nsarka' + ) steps: - name: Check if comment is issued by authorized person run: blossom-ci diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index d27de8781ed..42450c8ba4b 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -7,6 +7,10 @@ name: pull on: pull_request: +concurrency: + group: ${{ 
github.workflow }}-${{ github.event.pull_request.number || github.run_id}} + cancel-in-progress: true + run-name: CI status hello ${{ github.event.pull_request.number }} - ${{ github.event.pull_request.head.sha }} jobs: status_hello: @@ -23,3 +27,47 @@ jobs: -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \ -d "{\"state\":\"success\",\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\",\"description\":\"Authorized users: comment !build or !test to trigger CI pipelines. See wiki.\",\"context\":\"CI notes\"}" + + pr-agent-tools: + name: PR Agent tools + runs-on: ubuntu-latest + permissions: + pull-requests: write + issues: write + packages: read + container: + image: ghcr.io/nvidia/fuser:ci-llm-workflow + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + env: + GITHUB__USER_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CONFIG__PUBLISH_OUTPUT: true + + OPENAI__KEY: ${{ secrets.LLM_OPENAI__KEY }} + OPENAI__API_BASE: ${{ secrets.LLM_OPENAI__API_BASE }} + CONFIG__MODEL: ${{ secrets.LLM_CONFIG__MODEL }} + CONFIG__CUSTOM_MODEL_MAX_TOKENS: 131072 + + CONFIG__MAX_MODEL_TOKENS: 65536 + CONFIG__PUBLISH_OUTPUT_PROGRESS: false + + PR_REVIEWER__REQUIRE_SCORE_REVIEW: false + PR_REVIEWER__REQUIRE_TESTS_REVIEW: true + PR_REVIEWER__REQUIRE_ESTIMATE_EFFORT_TO_REVIEW: true + PR_REVIEWER__REQUIRE_CAN_BE_SPLIT_REVIEW: false + PR_REVIEWER__REQUIRE_SECURITY_REVIEW: false + PR_REVIEWER__REQUIRE_TICKET_ANALYSIS_REVIEW: false + + PR_REVIEWER__ENABLE_REVIEW_LABELS_EFFORT: false + PR_REVIEWER__ENABLE_REVIEW_LABELS_SECURITY: false + + PR_REVIEWER__PERSISTENT_COMMENT: true + PR_REVIEWER__FINAL_UPDATE_MESSAGE: false + + PR_REVIEWER__EXTRA_INSTRUCTIONS: | + Focus on potential logic change, especially on changes to function signatures. 
+ + steps: + - name: PR Agent review + run: python /app/pr_agent/cli.py --pr_url ${{ github.event.pull_request.html_url }} review diff --git a/CMakeLists.txt b/CMakeLists.txt index d1d10720550..b2fc8cc420d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,6 +240,7 @@ list(APPEND NVFUSER_SRCS ${NVFUSER_SRCS_DIR}/scheduler/tools/loop_domain_scheduler.cpp ${NVFUSER_SRCS_DIR}/scheduler/tools/maxinfo_propagator.cpp ${NVFUSER_SRCS_DIR}/scheduler/tools/resize_utils.cpp + ${NVFUSER_SRCS_DIR}/scheduler/tools/static_repeat.cpp ${NVFUSER_SRCS_DIR}/scheduler/transpose.cpp ${NVFUSER_SRCS_DIR}/scheduler/utils.cpp ${NVFUSER_SRCS_DIR}/scheduler/vectorize_helper.cpp @@ -446,6 +447,7 @@ if(BUILD_PYTHON) # nvfuser python API sources set(NVFUSER_PYTHON_SRCS) list(APPEND NVFUSER_PYTHON_SRCS + ${NVFUSER_SRCS_DIR}/python_frontend/communicator_bindings.cpp ${NVFUSER_SRCS_DIR}/python_frontend/python_bindings.cpp ${NVFUSER_SRCS_DIR}/python_frontend/python_bindings_extension.cpp ${NVFUSER_SRCS_DIR}/python_frontend/schedule_bindings.cpp @@ -698,7 +700,12 @@ if(BUILD_TEST) add_test(tutorial "${NVFUSER_ROOT}/tests/cpp/test_tutorial.cpp" "") list(APPEND TEST_BINARIES tutorial) - add_test(test_host_ir "${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp" "") + set(HOSTIR_TEST_SRCS) + list(APPEND HOSTIR_TEST_SRCS + ${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp + ${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp + ) + add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "") list(APPEND TEST_BINARIES test_host_ir) if(BUILD_PYTHON) diff --git a/benchmarks/cpp/utils.cpp b/benchmarks/cpp/utils.cpp index f0c99f830f8..4d069b3ec47 100644 --- a/benchmarks/cpp/utils.cpp +++ b/benchmarks/cpp/utils.cpp @@ -190,14 +190,14 @@ int64_t runBenchmarkIterations( ->groups() .size() > 1; - const auto& compile_log = executor_cache->getMostRecentExecutorInfo(); - auto params = toString(compile_log.params); - auto lparams = toString( - compile_log.fusion_executor->as()->lastLaunchParams()); // Only set if not segmented. In the case of segmented fusions, // this could be confusing as the log would refect only the last // segment. Revisit if necessary. if (!segmented) { + const auto& compile_log = executor_cache->getMostRecentExecutorInfo(); + auto params = toString(compile_log.params); + auto lparams = toString( + compile_log.fusion_executor->as()->lastLaunchParams()); benchmark_state.SetLabel(params + lparams); } diff --git a/benchmarks/python/conftest.py b/benchmarks/python/conftest.py index 03adbe1e7dd..eb5237eab86 100644 --- a/benchmarks/python/conftest.py +++ b/benchmarks/python/conftest.py @@ -32,6 +32,11 @@ def pytest_addoption(parser): action="store_true", help="Benchmarks torch.compile mode.", ) + parser.addoption( + "--benchmark-thunder-torchcompile", + action="store_true", + help="Benchmarks torch.compile mode.", + ) # pytest-benchmark does not have CLI options to set rounds/warmup_rounds for benchmark.pedantic. # The following two options are used to overwrite the default values through CLI. 
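As an illustrative aside (this snippet is not part of the diff): the new --benchmark-thunder-torchcompile option introduces a fourth executor string, and the dispatch that consumes it is the with_executor hunk in benchmarks/python/core.py further down in this diff. A condensed sketch of that mapping follows; the import paths for thunder and nvfuserex are assumptions based on what core.py already uses.

    # Condensed restatement of the with_executor() hunk in benchmarks/python/core.py.
    # Import paths are assumptions based on that file.
    import torch
    import thunder
    from thunder.executors.nvfuserex import nvfuserex  # nvFuser executor object used by core.py

    def with_executor(executor: str, fwd_fn, **kwargs):
        assert executor in ["eager", "torchcompile", "thunder", "thunder-torchcompile"]
        if executor == "eager":
            return fwd_fn                           # run the Python function unmodified
        if executor == "torchcompile":
            return torch.compile(fwd_fn, **kwargs)
        if executor == "thunder":
            # nvFuser-backed thunder path, with the bookend optimization disabled.
            return thunder.jit(fwd_fn, nv_enable_bookend=False, executors=[nvfuserex], **kwargs)
        if executor == "thunder-torchcompile":
            # New in this PR: trace with thunder, execute via its torchcompile executor.
            return thunder.jit(fwd_fn, executors=["torchcompile"], **kwargs)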
@@ -104,14 +109,14 @@ def pytest_collection_modifyitems(session, config, items): from nvfuser.pytorch_utils import retry_on_oom_or_skip_test - executors = ["eager", "torchcompile", "thunder"] + executors = ["eager", "torchcompile", "thunder", "thunder-torchcompile"] def get_test_executor(item) -> str | None: if hasattr(item, "callspec") and "executor" in item.callspec.params: test_executor = item.callspec.params["executor"] assert ( test_executor in executors - ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', found {test_executor}." + ), f"Expected executor to be one of 'eager', 'torchcompile', 'thunder', 'thunder-torchcompile', found {test_executor}." return test_executor return None diff --git a/benchmarks/python/core.py b/benchmarks/python/core.py index aea3662c5cb..ecf047f79b1 100644 --- a/benchmarks/python/core.py +++ b/benchmarks/python/core.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: BSD-3-Clause +from collections.abc import Iterable import pytest_benchmark import torch from torch.autograd import DeviceType @@ -47,14 +48,18 @@ def unary_bwd_torch(inputs: List): # [output, grad_out] inputs[0].backward(inputs[1], retain_graph=True) -def with_executor(executor: str, fwd_fn: Callable) -> Callable: - assert executor in ["eager", "torchcompile", "thunder"] +def with_executor(executor: str, fwd_fn: Callable, **kwargs) -> Callable: + assert executor in ["eager", "torchcompile", "thunder", "thunder-torchcompile"] if executor == "eager": return fwd_fn if executor == "torchcompile": - return torch.compile(fwd_fn) + return torch.compile(fwd_fn, **kwargs) if executor == "thunder": - return thunder.jit(fwd_fn, nv_enable_bookend=False, executors=[nvfuserex]) + return thunder.jit( + fwd_fn, nv_enable_bookend=False, executors=[nvfuserex], **kwargs + ) + if executor == "thunder-torchcompile": + return thunder.jit(fwd_fn, executors=["torchcompile"], **kwargs) def compute_total_iobytes( @@ -221,9 +226,9 @@ def set_metrics( % Peak Bandwidth (SOL): 100 * Bandwidth /PEAK_BANDWIDTH """ if not iobytes: - if isinstance(inputs, torch.Tensor): + if not isinstance(inputs, Iterable): inputs = [inputs] - if isinstance(outputs, torch.Tensor): + if not isinstance(outputs, Iterable): outputs = [outputs] iobytes = 0 diff --git a/benchmarks/python/normalization.py b/benchmarks/python/normalization.py index a4f72242f4f..8639b9b2fd1 100644 --- a/benchmarks/python/normalization.py +++ b/benchmarks/python/normalization.py @@ -492,7 +492,6 @@ def norm_bwd_baseline_benchmark( norm_fwd_fn = batchnorm_fwd_fn if norm == "batch_norm" else instancenorm_fwd_fn - # Compile the fwd fn for torchcompile fwd_fn = with_executor(executor, norm_fwd_fn) fwd_inputs = [inputs, weight, bias, running_mean, running_var] outputs = fwd_fn(fwd_inputs) diff --git a/benchmarks/python/rope_ops.py b/benchmarks/python/rope_ops.py new file mode 100644 index 00000000000..344d12d272b --- /dev/null +++ b/benchmarks/python/rope_ops.py @@ -0,0 +1,998 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +import torch + +from torch import nn + +from typing import Tuple +from functools import partial + + +def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + head_size = x.size(-1) + x1 = x[..., : head_size // 2] # (B, nh, T, hs/2) + x2 = x[..., head_size // 2 :] # (B, nh, T, hs/2) + rotated = torch.cat((-x2, x1), dim=-1) # (B, nh, T, hs) + if cos.dim() > 1: + # batch dimensions must align + # sin/cos are (B, T, hs) so we unsqeeze -3 for nh + # we count from back because all of apply_rope does + cos = cos.unsqueeze(-3) + sin = sin.unsqueeze(-3) + + roped = (x * cos) + (rotated * sin) + return roped.to(dtype=x.dtype) + + +def llama_hf_rope(config_str): + class Config: + def __init__( + self, n_head, head_size, n_query_groups, rope_n_elem, batches, seq_length + ): + self.n_head = n_head + self.head_size = head_size + self.n_query_groups = n_query_groups + self.rope_n_elem = rope_n_elem + self.batches = batches + self.seq_length = seq_length + + class LitGPTRope(torch.nn.Module): + def __init__(self, config): + super(LitGPTRope, self).__init__() + self.config = config + + def forward(self, qkv, cos, sin): + B, T, _ = qkv.size() + # assemble into a number of query groups to support MHA, MQA and GQA together (see `config.n_query_groups`) + q_per_kv = self.config.n_head // self.config.n_query_groups + total_qkv = q_per_kv + 2 # each group has 1+ queries, 1 key, and 1 value + qkv = qkv.view( + B, T, self.config.n_query_groups, total_qkv, self.config.head_size + ) + qkv = qkv.permute(0, 2, 3, 1, 4) # (B, n_query_groups, total_qkv, T, hs) + + # split batched computation into three + q, k, v = qkv.split((q_per_kv, 1, 1), dim=2) + + # maybe repeat k and v if for the non multi-head attention cases + # training: flash attention requires it + # inference: multi-query would require a full kv cache so avoid it to limit its memory usage + # if self.config.n_query_groups != self.config.n_head and (input_pos is None or self.config.n_query_groups != 1): + if self.config.n_query_groups != self.config.n_head and ( + self.config.n_query_groups != 1 + ): + k = k.expand( + B, self.config.n_query_groups, q_per_kv, T, self.config.head_size + ) + v = v.expand( + B, self.config.n_query_groups, q_per_kv, T, self.config.head_size + ) + + q = q.reshape(B, -1, T, self.config.head_size) # (B, nh_q, T, hs) + k = k.reshape(B, -1, T, self.config.head_size) # (B, nh_k, T, hs) + v = v.reshape(B, -1, T, self.config.head_size) # (B, nh_v, T, hs) + + q_roped = apply_rope(q[..., : self.config.rope_n_elem], cos, sin) + k_roped = apply_rope(k[..., : self.config.rope_n_elem], cos, sin) + q = torch.cat((q_roped, q[..., self.config.rope_n_elem :]), dim=-1) + k = torch.cat((k_roped, k[..., self.config.rope_n_elem :]), dim=-1) + return q, k + + configs = {} + configs["llama_2_7b_hf_rope"] = Config( + n_head=32, + head_size=128, + n_query_groups=32, + rope_n_elem=128, + batches=2, + seq_length=4096, + ) + configs["llama_3_8B_rope"] = Config( + n_head=32, + head_size=128, + n_query_groups=8, + rope_n_elem=128, + batches=2, + seq_length=8192, + ) + + cfg = configs[config_str] + + def inputs(): + qkv = torch.randn( + cfg.batches, + cfg.seq_length, + cfg.head_size * (cfg.n_head + 2 * cfg.n_query_groups), + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + cos = torch.randn( + cfg.seq_length, + cfg.rope_n_elem, + device="cuda", + dtype=torch.bfloat16, + requires_grad=False, + ) + sin = torch.randn( + cfg.seq_length, + cfg.rope_n_elem, + device="cuda", + 
dtype=torch.bfloat16, + requires_grad=False, + ) + return qkv, cos, sin + + def grads(): + grad = torch.randn( + cfg.batches, + cfg.n_head, + cfg.seq_length, + cfg.head_size, + device="cuda", + dtype=torch.bfloat16, + requires_grad=False, + ) + return grad + + # Manual IOBytes computes the total bandwidth for thunder backward trace. + def iobytes(): + n_elements = 0 + # adding size of q.grad + k.grad + n_elements += 2 * cfg.batches * cfg.n_head * cfg.seq_length * cfg.head_size + # adding size of cos, sin + n_elements += 2 * cfg.seq_length * cfg.rope_n_elem + # adding size of qkv.grad + n_elements += ( + cfg.batches + * cfg.seq_length + * cfg.head_size + * (cfg.n_head + 2 * cfg.n_query_groups) + ) + # scale by dtype size + return n_elements * torch.bfloat16.itemsize + + return LitGPTRope(cfg).cuda().bfloat16(), inputs, grads, iobytes + + +def hf_qwen2_rope(): + import json + from transformers.models.qwen2 import Qwen2Config + + qwen_cfg_str = r"""{ + "_name_or_path": "Qwen/Qwen2.5-7B-Instruct", + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.43.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 + } + """ + + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + class Qwen2Rope(nn.Module): + def __init__(self, config: Qwen2Config): + super().__init__() + self.config = config + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + def forward( + self, + query_in_states: torch.Tensor, + key_in_states: torch.Tensor, + value_in_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + past_key_value = None + bsz, q_len, _ = query_in_states.size() + + query_states = query_in_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_in_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_in_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin + ) + + if past_key_value is not None: + assert False + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + return query_states, key_states, value_states + + cfg = Qwen2Config.from_dict(json.loads(qwen_cfg_str)) + cfg.batch_size = 1 + cfg.seq_len = 4096 + + head_dim = cfg.hidden_size // cfg.num_attention_heads + + def inputs(): + q = torch.randn( + cfg.batch_size, + cfg.seq_len, + cfg.num_attention_heads * head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + k = torch.randn( + cfg.batch_size, + cfg.seq_len, + cfg.num_key_value_heads * head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + v = torch.randn( + cfg.batch_size, + cfg.seq_len, + cfg.num_key_value_heads * head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + cos = torch.randn( + cfg.batch_size, + cfg.seq_len, + head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + sin = torch.randn( + cfg.batch_size, + cfg.seq_len, + head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + return q, k, v, cos, sin + + def grads(): + grad = torch.randn( + cfg.batch_size, + cfg.num_attention_heads, + cfg.seq_len, + head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=False, + ) + return grad + + # Manual IOBytes computes the total bandwidth for thunder backward trace. 
+ def iobytes(): + n_elements = 0 + # adding size of query_states.grad + key_states.grad + value_states.grad + n_elements += ( + 3 * cfg.batch_size * cfg.num_attention_heads * cfg.seq_len * head_dim + ) + # adding size of query_states + key_states + n_elements += ( + 2 * cfg.batch_size * cfg.num_attention_heads * cfg.seq_len * head_dim + ) + # adding size of cos, sin + n_elements += 2 * cfg.batch_size * cfg.seq_len * head_dim + # adding size of q.grad + n_elements += cfg.batch_size * cfg.seq_len * cfg.num_attention_heads * head_dim + # adding size of k.grad, v.grad + n_elements += ( + 2 * cfg.batch_size * cfg.seq_len * cfg.num_key_value_heads * head_dim + ) + # adding size of cos.grad, sin.grad + n_elements += 2 * cfg.batch_size * cfg.seq_len * head_dim + # scale by dtype size + return n_elements * torch.bfloat16.itemsize + + return Qwen2Rope(cfg).cuda().bfloat16(), inputs, grads, iobytes + + +def hf_phi3_rope(): + import json + from transformers.models.phi3 import Phi3Config + + phi35_cfg_str = r"""{ + "_name_or_path": "microsoft/Phi-3.5-mini-instruct", + "architectures": [ + "Phi3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "microsoft/Phi-3.5-mini-instruct--configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "microsoft/Phi-3.5-mini-instruct--modeling_phi3.Phi3ForCausalLM" + }, + "bos_token_id": 1, + "embd_pdrop": 0.0, + "eos_token_id": 32000, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "model_type": "phi3", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 
2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "sliding_window": 262144, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.3", + "use_cache": true, + "vocab_size": 32064 + }""" + + class Phi3RotaryEmbedding(nn.Module): + def __init__( + self, dim, max_position_embeddings=2048, base=10000.0, device=None + ): + super().__init__() + + self.dim = dim + self.max_position_embddings = max_position_embeddings + self.base = base + + inv_freq = 1.0 / ( + self.base + ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim) + ) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = ( + self.inv_freq[None, :, None] + .float() + .expand(position_ids.shape[0], -1, 1) + ) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = ( + device_type + if isinstance(device_type, str) and device_type != "mps" + else "cpu" + ) + with torch.autocast(device_type=device_type, enabled=False): + freqs = ( + inv_freq_expanded.float() @ position_ids_expanded.float() + ).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + class HfPhi3Rope(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Phi3Config): + super().__init__() + self.config = config + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.original_max_position_embeddings = ( + config.original_max_position_embeddings + ) + self.rope_theta = config.rope_theta + self.rope_scaling = config.rope_scaling + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.rotary_emb = Phi3RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, qkv: torch.Tensor, position_ids: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + past_key_value = None + bsz, q_len, _ = qkv.size() + + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[ + ..., query_pos : query_pos + self.num_key_value_heads * self.head_dim + ] + value_states = qkv[ + ..., query_pos + self.num_key_value_heads * self.head_dim : + ] + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + assert False + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + if past_key_value is not None: + assert False + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + return query_states, key_states, value_states + + cfg = Phi3Config.from_dict(json.loads(phi35_cfg_str)) + cfg.batch_size = 1 + cfg.seq_len = 8192 + head_dim = cfg.hidden_size // cfg.num_attention_heads + + def inputs(): + qkv = torch.randn( + cfg.batch_size, + cfg.seq_len, + cfg.num_attention_heads * head_dim + + 2 * (cfg.num_key_value_heads * head_dim), + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + position_ids = torch.arange(0, cfg.seq_len, device="cuda").unsqueeze(0) + return qkv, position_ids + + def grads(): + grad = torch.randn( + cfg.batch_size, + cfg.num_attention_heads, + cfg.seq_len, + head_dim, 
+ device="cuda", + dtype=torch.bfloat16, + requires_grad=False, + ) + return grad + + # Manual IOBytes computes the total bandwidth for thunder backward trace. + def iobytes(): + n_elements = 0 + # adding size of query_states.grad + key_states.grad + value_states.grad + n_elements += ( + 3 * cfg.batch_size * cfg.num_attention_heads * cfg.seq_len * head_dim + ) + # adding size of qkv.grad + n_elements += ( + cfg.batch_size + * cfg.seq_len + * ( + cfg.num_attention_heads * head_dim + + 2 * (cfg.num_key_value_heads * head_dim) + ) + ) + # matmul output size + n_elements_matmul_out = head_dim / 2 * cfg.seq_len + # totoal io sizes + return ( + n_elements * torch.bfloat16.itemsize + + n_elements_matmul_out * torch.float32.itemsize + ) + + return HfPhi3Rope(cfg).cuda().bfloat16(), inputs, grads, iobytes + + +def hf_mistral_nemo_rope(): + import json + from transformers.models.mistral import MistralConfig + + mistral_cfg_str = r"""{ + "_name_or_path": "mistralai/Mistral-Nemo-Base-2407", + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 128000, + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.43.3", + "use_cache": true, + "vocab_size": 131072 + } + """ + + class MistralRotaryEmbedding(nn.Module): + def __init__( + self, dim, max_position_embeddings=2048, base=10000.0, device=None + ): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base + ** ( + torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) + / self.dim + ) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + @torch.no_grad() + # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward + # TODO(joao): add me back asap :) + def forward(self, x, position_ids): + # x: [bs, num_attention_heads, seq_len, head_size] + inv_freq_expanded = ( + self.inv_freq[None, :, None] + .float() + .expand(position_ids.shape[0], -1, 1) + ) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = ( + device_type + if isinstance(device_type, str) and device_type != "mps" + else "cpu" + ) + with torch.autocast(device_type=device_type, enabled=False): + freqs = ( + inv_freq_expanded.float() @ position_ids_expanded.float() + ).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. 
+ sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + class MistralNemoRope(nn.Module): + def __init__(self, config: MistralConfig): + super().__init__() + self.config = config + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + self.rotary_emb = MistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + query_in_states: torch.Tensor, + key_in_states: torch.Tensor, + value_in_states: torch.Tensor, + position_ids: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + past_key_value = None + bsz, q_len, _ = query_in_states.size() + + query_states = query_in_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_in_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_in_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin + ) + + if past_key_value is not None: + assert False + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + return query_states, key_states, value_states + + cfg = MistralConfig.from_dict(json.loads(mistral_cfg_str)) + cfg.batch_size = 1 + cfg.seq_len = 4096 + + head_dim = cfg.hidden_size // cfg.num_attention_heads + + 
def inputs(): + q = torch.randn( + cfg.batch_size, + cfg.seq_len, + cfg.num_attention_heads * cfg.head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + k = torch.randn( + cfg.batch_size, + cfg.seq_len, + cfg.num_key_value_heads * cfg.head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + v = torch.randn( + cfg.batch_size, + cfg.seq_len, + cfg.num_key_value_heads * cfg.head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=True, + ) + position_ids = torch.arange(0, cfg.seq_len, device="cuda").unsqueeze(0) + return q, k, v, position_ids + + def grads(): + grad = torch.randn( + cfg.batch_size, + cfg.num_attention_heads, + cfg.seq_len, + cfg.head_dim, + device="cuda", + dtype=torch.bfloat16, + requires_grad=False, + ) + return grad + + # Manual IOBytes computes the total bandwidth for thunder backward trace. + def iobytes(): + n_elements = 0 + # adding size of query_states.grad + key_states.grad + value_states.grad + n_elements += ( + 3 * cfg.batch_size * cfg.num_attention_heads * cfg.seq_len * cfg.head_dim + ) + # adding size of q.grad + n_elements += ( + cfg.batch_size * cfg.seq_len * cfg.num_attention_heads * cfg.head_dim + ) + # adding size of k.grad, v.grad + n_elements += ( + 2 * cfg.batch_size * cfg.seq_len * cfg.num_key_value_heads * cfg.head_dim + ) + # matmul output size + n_elements_matmul_out = head_dim / 2 * cfg.seq_len + # totoal io sizes + return ( + n_elements * torch.bfloat16.itemsize + + n_elements_matmul_out * torch.float32.itemsize + ) + + return MistralNemoRope(cfg).cuda().bfloat16(), inputs, grads, iobytes + + +# The setup returns a function that would setup benchmark by returning: +# fwd_model, inputs_fn, grads_fn, iobytes_fn +rope_setup = { + "llama_2_7b_hf_rope": partial(llama_hf_rope, config_str="llama_2_7b_hf_rope"), + "llama_3_8B_rope": partial(llama_hf_rope, config_str="llama_3_8B_rope"), + "hf_qwen2_rope": hf_qwen2_rope, + "hf_phi3_rope": hf_phi3_rope, + "hf_mistral_nemo_rope": hf_mistral_nemo_rope, +} diff --git a/benchmarks/python/test_rope.py b/benchmarks/python/test_rope.py index 5df43bdc6ea..72376e4e6b3 100644 --- a/benchmarks/python/test_rope.py +++ b/benchmarks/python/test_rope.py @@ -2,177 +2,83 @@ # All rights reserved. # SPDX-License-Identifier: BSD-3-Clause import pytest -from nvfuser import FusionDefinition, DataType -from .core import run_benchmark -import torch - - -# Mimic the Hugging Face implementation: -# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L216 -def rope_with_cat_fusion( - fd: FusionDefinition, - batch_size: int, - seq_len: int, - num_heads: int, - features_per_head: int, -) -> None: - q = fd.define_tensor( - shape=[batch_size, seq_len, num_heads, features_per_head], - dtype=DataType.BFloat16, - ) - cos = fd.define_tensor( - shape=[seq_len, features_per_head], - dtype=DataType.BFloat16, - ) - sin = fd.define_tensor( - shape=[seq_len, features_per_head], - dtype=DataType.BFloat16, - ) - - q = fd.ops.permute(q, dims=[0, 2, 1, 3]) - q_real = fd.ops.slice( - q, - start_indices=[0, 0, 0, 0], - end_indices=[batch_size, num_heads, seq_len, features_per_head // 2], - strides=[1, 1, 1, 1], - ) - q_image = fd.ops.slice( - q, - start_indices=[0, 0, 0, features_per_head // 2], - end_indices=[batch_size, num_heads, seq_len, features_per_head], - strides=[1, 1, 1, 1], - ) - - # nvFuser has problems generating negation for bfloat. 
- q_image = fd.ops.cast(q_image, dtype=DataType.Float) - q_image = -q_image - q_image = fd.ops.cast(q_image, dtype=DataType.BFloat16) - - q_rotated = fd.ops.cat([q_image, q_real], dim=-1) - - cos = fd.ops.broadcast_in_dim( - cos, shape=[1, 1, seq_len, features_per_head], broadcast_dims=[2, 3] - ) - sin = fd.ops.broadcast_in_dim( - sin, shape=[1, 1, seq_len, features_per_head], broadcast_dims=[2, 3] - ) - - out = q * cos + q_rotated * sin - out = fd.ops.cast(out, DataType.BFloat16) - fd.add_output(out) - - -# Idea from @nikitaved: we split and concatenate the embeddings instead of `q`. -# The embeddings are constant that can be precomputed. So we pay the overhead -# of split and concatenation only once. The actual forward pass is merely -# elementwise+reduction surrounded by some meta ops. -def rope_without_cat_fusion( - fd: FusionDefinition, - batch_size: int, # B - seq_len: int, # S - num_heads: int, # H - features_per_head: int, # F -) -> None: - q = fd.define_tensor( - shape=[batch_size, seq_len, num_heads, features_per_head], - dtype=DataType.BFloat16, - ) - # `cos_sin_matrix` is essentially a batch (of size S*F/2) of 2x2 matrices - # laid out in a special way to keep computation simple. - # - # Using the notations in Figure 1 in https://arxiv.org/pdf/2104.09864.pdf, - # cos_sin_matrix[0] contains the following: - # - # cos(θ_1), -sin(θ1) - # cos(θ_2), -sin(θ2) - # ... - # cos(θ_F/2), -sin(θ_F/2) - # ------------------------ - # sin(θ_1), cos(θ_1) - # sin(θ_2), cos(θ_2) - # ... - # sin(θ_F/2), cos(θ_F/2) - # - # cos_sin_matrix[i] is similar but each θ is multiplied by `i+1`. - cos_sin_matrix = fd.define_tensor( - shape=[seq_len, 2, features_per_head // 2, 2], - dtype=DataType.BFloat16, - ) - - q = fd.ops.reshape( - q, new_shape=[batch_size, seq_len, num_heads, 2, features_per_head // 2] - ) - q = fd.ops.permute(q, dims=[0, 2, 1, 4, 3]) - q = fd.ops.broadcast_in_dim( - q, - shape=[batch_size, num_heads, seq_len, 1, features_per_head // 2, 2], - broadcast_dims=[0, 1, 2, 4, 5], - ) - - cos_sin_matrix = fd.ops.broadcast_in_dim( - cos_sin_matrix, - shape=[batch_size, num_heads, seq_len, 2, features_per_head // 2, 2], - broadcast_dims=[2, 3, 4, 5], - ) - - out = fd.ops.sum(q * cos_sin_matrix, [-1]) - out = fd.ops.cast(out, DataType.BFloat16) - out = fd.ops.reshape( - out, new_shape=[batch_size, num_heads, seq_len, features_per_head] - ) - fd.add_output(out) - - -@pytest.mark.parametrize("use_cat", [True, False], ids=["with_cat", "without_cat"]) -def test_rope_benchmark( - benchmark, use_cat: bool, disable_validation: bool, disable_benchmarking: bool +from .core import run_benchmark, with_executor, unary_bwd_torch, clear_dynamo_cache + +from .rope_ops import rope_setup + + +@pytest.mark.parametrize( + "variation", + [ + "llama_2_7b_hf_rope", + "llama_3_8B_rope", + "hf_qwen2_rope", + "hf_phi3_rope", + "hf_mistral_nemo_rope", + ], +) +@pytest.mark.parametrize( + "executor", ["eager", "torchcompile", "thunder", "thunder-torchcompile"] +) +def test_rope_fwd_benchmark( + benchmark, + variation: str, + executor: str, ): - batch_size = 32 - seq_len = 4096 - num_heads = 32 - features_per_head = 128 + kwargs = {} + if executor == "thunder": + kwargs["nv_enable_matmul"] = True + elif executor == "torchcompile": + clear_dynamo_cache() + + model, inputs, _, _ = rope_setup[variation]() + + def fwd_call(inp): + return model(*inp) + + # Compile the fwd fn for torchcompile + benchmark_fn = with_executor(executor, fwd_call, **kwargs) + run_benchmark(benchmark, benchmark_fn, inputs()) + + 
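As an illustrative aside (not part of the diff): each rope_setup entry returns (model, inputs_fn, grads_fn, iobytes_fn), and with_executor wraps the forward call exactly as test_rope_fwd_benchmark above and test_rope_bwd_benchmark below do. The sketch assumes a CUDA device, and the module paths are a guess at the package layout (the tests themselves use relative imports).

    # Exercise one RoPE variation by hand, mirroring the fwd and bwd benchmark tests.
    import torch
    from rope_ops import rope_setup      # benchmarks/python/rope_ops.py (new file in this diff)
    from core import with_executor       # benchmarks/python/core.py

    model, inputs_fn, grads_fn, iobytes_fn = rope_setup["llama_3_8B_rope"]()

    fwd = with_executor("torchcompile", lambda inp: model(*inp))
    q, k = fwd(inputs_fn())              # rotated query/key, as in test_rope_fwd_benchmark

    # Backward, mirroring test_rope_bwd_benchmark: fold the outputs so a single
    # grad tensor from grads_fn() can drive the whole backward pass.
    (q + k).backward(grads_fn())
    print("reference bwd traffic (bytes):", iobytes_fn())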
+@pytest.mark.parametrize( + "variation", + [ + "llama_2_7b_hf_rope", + "llama_3_8B_rope", + "hf_qwen2_rope", + "hf_phi3_rope", + "hf_mistral_nemo_rope", + ], +) +@pytest.mark.parametrize( + "executor", ["eager", "torchcompile", "thunder", "thunder-torchcompile"] +) +def test_rope_bwd_benchmark( + benchmark, + variation: str, + executor: str, +): + kwargs = {} + if executor == "thunder": + kwargs["nv_enable_matmul"] = True + elif executor == "torchcompile": + clear_dynamo_cache() + + model, fwd_inputs, grad, iobytes = rope_setup[variation]() - # torch.manual_seed(0) - q = torch.randn( - batch_size, - seq_len, - num_heads, - features_per_head, - dtype=torch.bfloat16, - device="cuda:0", - ) - freqs = torch.randn( - seq_len, features_per_head // 2, dtype=torch.bfloat16, device="cuda:0" - ) - cos = freqs.cos() - sin = freqs.sin() + def fwd_call(inp): + return model(*inp) - if use_cat: - with FusionDefinition() as fd: - rope_with_cat_fusion(fd, batch_size, seq_len, num_heads, features_per_head) - inputs = [q, torch.cat([cos, cos], dim=-1), torch.cat([sin, sin], dim=-1)] - else: - with FusionDefinition() as fd: - rope_without_cat_fusion( - fd, batch_size, seq_len, num_heads, features_per_head - ) - # [S, F/2, 2] - cos_and_minus_sin = torch.stack([cos, -sin], dim=-1) - # [S, F/2, 2] - sin_and_cos = torch.stack([sin, cos], dim=-1) - # [S, 2, F/2, 2] - cos_sin_matrix = torch.stack([cos_and_minus_sin, sin_and_cos], dim=1) - inputs = [q, cos_sin_matrix] + # execute the compiled fwd fn + fwd_fn = with_executor(executor, fwd_call, **kwargs) + outputs = fwd_fn(fwd_inputs()) - if not disable_validation: - q_real, q_image = q.permute([0, 2, 1, 3]).split(features_per_head // 2, dim=-1) - q_real = q_real.to(torch.float32) - q_image = q_image.to(torch.float32) - ref_out = torch.cat( - [q_real * cos - q_image * sin, q_image * cos + q_real * sin], dim=-1 - ).to(torch.bfloat16) - nvf_out = fd.execute(inputs) - torch.testing.assert_close(nvf_out, [ref_out], atol=0, rtol=0) + # accumulate all output, so we can feed a single grad and use the unary bwd function + output = outputs[0] + for i in range(1, len(outputs)): + output += outputs[i] - if not disable_benchmarking: - run_benchmark(benchmark, fd.execute, inputs) + # NOTE: the iobytes is computed based on how thunder autograd worked. So this is just + # a reference point for torchcompile and eager executor for comparison. 
+ run_benchmark(benchmark, unary_bwd_torch, [output, grad()], iobytes=iobytes()) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index 3a5f31c74d5..fe00c1f8105 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -158,9 +158,10 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { public: static std::string generateKernelDefinition( const kir::Kernel* kernel, - const std::string& kernel_name) { + const std::string& kernel_name, + std::optional num_threads_per_cta) { CudaKernelGenerator codegen(kernel); - codegen.genDeclaration(kernel_name); + codegen.genDeclaration(kernel_name, num_threads_per_cta); codegen.startBlock(); codegen.genPrologue(); codegen.genBody(); @@ -272,8 +273,18 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { } // Generates the kernel function declaration - void genDeclaration(const std::string& kernel_name) { + void genDeclaration( + const std::string& kernel_name, + std::optional num_threads_per_cta) { code_ << "__global__ void "; + if (kernel_->hasManaged("enable_register_sharing") && + kernel_->getManaged("enable_register_sharing")) { + NVF_ERROR( + num_threads_per_cta.has_value(), + "__launch_bounds__ must be set for register sharing warp specialization"); + code_ << "__launch_bounds__(/*MAX_THREADS_PER_BLOCK=*/" + << num_threads_per_cta.value() << ") "; + } if (kernel_->hasManaged("cluster_dims")) { auto cluster_dims = kernel_->getManaged>( @@ -3542,9 +3553,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { std::string generateCudaKernel( const kir::Kernel* kernel, - const std::string& kernel_name) { + const std::string& kernel_name, + std::optional num_threads_per_cta) { FUSER_PERF_SCOPE("generateCudaKernel"); - return CudaKernelGenerator::generateKernelDefinition(kernel, kernel_name); + return CudaKernelGenerator::generateKernelDefinition( + kernel, kernel_name, num_threads_per_cta); } } // namespace codegen diff --git a/csrc/codegen.h b/csrc/codegen.h index 8c0e89663d1..e2f1382c8d2 100644 --- a/csrc/codegen.h +++ b/csrc/codegen.h @@ -19,7 +19,8 @@ namespace codegen { //! Generates a CUDA kernel definition for the given kernel NVF_API std::string generateCudaKernel( const kir::Kernel* kernel, - const std::string& kernel_name = "CUDAGeneratedKernel"); + const std::string& kernel_name = "CUDAGeneratedKernel", + std::optional num_threads_per_cta = std::nullopt); } // namespace codegen } // namespace nvfuser diff --git a/csrc/device_lower/analysis/circular_buffer.cpp b/csrc/device_lower/analysis/circular_buffer.cpp index 58f35a1f8f0..21f58c8d43d 100644 --- a/csrc/device_lower/analysis/circular_buffer.cpp +++ b/csrc/device_lower/analysis/circular_buffer.cpp @@ -143,6 +143,29 @@ void validateCircularBufferedTensor(const TensorView* tv) { ". Consumer memory type: ", c_mem_type); + // Ensure that the warp-specialized circular buffer loop is the outer-most + // for-loop if register sharing is enabled. 
+ if (std::holds_alternative( + tv->circularBufferOptions().type) && + std::get(tv->circularBufferOptions().type) + .num_registers.has_value()) { + for (int64_t axis : c10::irange((int64_t)tv->getLoopDomain().size())) { + // short-circuit: only check IterDomains to the left of the circular + // buffer position + if (axis >= circular_buffer_pos) { + break; + } + NVF_ERROR( + tv->getLoopDomain().at(axis)->isThread() || + tv->getLoopDomain().at(axis)->isDeviceDim() || + tv->getLoopDomain().at(axis)->isBroadcast() || + tv->getLoopDomain().at(axis)->isOneInt(), + "When using register sharing with warp-specialized circular " + "buffering, the circular buffer loop must be the outer-most " + "for-loop."); + } + } + return; } diff --git a/csrc/device_lower/analysis/device_version.cpp b/csrc/device_lower/analysis/device_version.cpp index 4682adfaf75..efc0f015914 100644 --- a/csrc/device_lower/analysis/device_version.cpp +++ b/csrc/device_lower/analysis/device_version.cpp @@ -69,6 +69,18 @@ void MinimumDeviceVersion::handle(LoadStoreOp* ls_op) { } } +void MinimumDeviceVersion::handle(TensorView* tv) { + bool enable_register_sharing = std::holds_alternative( + tv->circularBufferOptions().type) && + std::get(tv->circularBufferOptions().type) + .num_registers.has_value(); + if (enable_register_sharing) { + ensureVersion( + {9, 0}, + "Warp Specialized Circular Buffering uses the setmaxnreg ptx instruction, which requires Hopper (9.0)"); + } +} + void MinimumDeviceVersion::ensureVersion( std::pair version, std::string reason) { diff --git a/csrc/device_lower/analysis/device_version.h b/csrc/device_lower/analysis/device_version.h index 5fef6b36333..3ebe3a4fa34 100644 --- a/csrc/device_lower/analysis/device_version.h +++ b/csrc/device_lower/analysis/device_version.h @@ -48,6 +48,11 @@ class MinimumDeviceVersion : private IterVisitor { //! https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async void handle(LoadStoreOp* ls_op) final; + //! If TensorView has warp specialized circular buffering, it will use the + //! setmaxnreg ptx instruction that requires Hopper (9.0+). + //! https://docs.nvidia.com/cuda/parallel-thread-execution/#miscellaneous-instructions-setmaxnreg + void handle(TensorView* tv) final; + //! 
bump min_version_ to at least this value void ensureVersion(std::pair version, std::string reason); diff --git a/csrc/device_lower/pass/circular_buffer.cpp b/csrc/device_lower/pass/circular_buffer.cpp index 15ed808b936..84de3c62bf5 100644 --- a/csrc/device_lower/pass/circular_buffer.cpp +++ b/csrc/device_lower/pass/circular_buffer.cpp @@ -1394,11 +1394,48 @@ class CircularBufferInserter : private kir::ExprMutator { warp_specialize_on), circular_buffer_loop->fusion()->oneVal())))); + // Set default value + auto& circular_buffer_options = + GpuLower::current()->circularBufferInfo().getCircularBufferOptionsFor( + circular_buffer_loop->iter_domain()); + bool enable_register_sharing = + std::holds_alternative(circular_buffer_options.type) && + std::get(circular_buffer_options.type) + .num_registers.has_value(); + + GpuLower::current()->kernel()->manage( + "enable_register_sharing", enable_register_sharing); + + if (enable_register_sharing) { + auto&& [decrease_num_registers, increase_num_registers] = + std::get(circular_buffer_options.type) + .num_registers.value(); + + // Decrease registers in load warp group + kir::SetMaxNReg* dec_reg_load_warp = IrBuilder::create( + IrBuilder::create(decrease_num_registers, DataType::Index), + /*increase_registers=*/false); + warp_dispatch_ite->thenBody().push_back(dec_reg_load_warp); + + // Increase registers in compute warp group + kir::SetMaxNReg* inc_reg_load_warp = IrBuilder::create( + IrBuilder::create(increase_num_registers, DataType::Index), + /*increase_registers*/ true); + warp_dispatch_ite->elseBody().push_back(inc_reg_load_warp); + } + // Load loop: ForLoop* load_loop = CloneTmaCircularBufferLoopAndInsertSync::clone( circular_buffer_loop, loads, CircularBufferLoopStage::LoadWarp); warp_dispatch_ite->thenBody().push_back(load_loop); + if (enable_register_sharing) { + // Terminate the warp group handling Load loop immediately after + // finishing its work. + kir::Return* ret = IrBuilder::create(); + warp_dispatch_ite->thenBody().push_back(ret); + } + // Prefetch: auto prefetch_loop = createArrivesForWar(circular_buffer_loop); warp_dispatch_ite->elseBody().push_back(prefetch_loop); diff --git a/csrc/device_lower/pass/fusion_simplifier.cpp b/csrc/device_lower/pass/fusion_simplifier.cpp index ed870a586da..03c92914695 100644 --- a/csrc/device_lower/pass/fusion_simplifier.cpp +++ b/csrc/device_lower/pass/fusion_simplifier.cpp @@ -56,6 +56,16 @@ class LoadStoreOpInserter : private kir::ExprMutator { container, LoadStoreOpType::Set, out, in)); } + void handle(RepeatOp* op) final { + auto out = op->out(); + auto in = op->in(); + auto container = out->container(); + registerReplaceAndPropagate( + op, + IrBuilder::createInContainer( + container, LoadStoreOpType::Set, out, in)); + } + void handle(ViewOp* vop) final { auto out = vop->out(); auto in = vop->in(); diff --git a/csrc/device_lower/pass/insert_syncs.cpp b/csrc/device_lower/pass/insert_syncs.cpp index 4e2f55323be..ca0892d8036 100644 --- a/csrc/device_lower/pass/insert_syncs.cpp +++ b/csrc/device_lower/pass/insert_syncs.cpp @@ -393,11 +393,21 @@ class ReadAfterWriteSyncs : public kir::ExprMutator { if (auto mma = dynamic_cast(expr)) { if (mma->isHopper()) { auto scope = scope_.empty() ? nullptr : scope_.back(); + // Makes sure that writes to operands in the generic proxy are visible + // to the async proxy + // When wgmma_fence needs to be issued by all warps: + // 1) Before the first wgmma.mma_async operation in a warp group. 
+ // 2) Between a register access by a thread in the warp group and any + // wgmma.mma_async instruction that accesses the same registers, either + // as accumulator or input register containing fragments of matrix A, + // except when these are accumulator register accesses across multiple + // wgmma.mma_async instructions of the same shape. In the latter case, + // an ordering guarantee is provided by default. + auto wgmma_fence = IrBuilder::create(); + registerInsertBefore(expr, wgmma_fence, scope); if (!lower_utils::allMmaInputsGuardedByMBarrier(mma)) { - // Makes sure that writes to operands in the generic proxy are visible - // to the async proxy - auto wgmma_fence = IrBuilder::create(); - registerInsertBefore(expr, wgmma_fence, scope); + // fence.proxy.async makes sure that writes to operands in the generic + // proxy are visible to the async proxy auto fence_async = IrBuilder::create(); registerInsertBefore(expr, fence_async, scope); } diff --git a/csrc/device_lower/utils.cpp b/csrc/device_lower/utils.cpp index 35b825d5348..66baab289db 100644 --- a/csrc/device_lower/utils.cpp +++ b/csrc/device_lower/utils.cpp @@ -162,6 +162,7 @@ bool isTvOp(const Expr* expr) { BroadcastOp, SqueezeOp, ExpandOp, + RepeatOp, ViewAsScalar, ViewOp, PadOp, diff --git a/csrc/dispatch.h b/csrc/dispatch.h index 7681aa878a1..c2a2816fb7e 100644 --- a/csrc/dispatch.h +++ b/csrc/dispatch.h @@ -96,6 +96,7 @@ class Val; f(BroadcastOp); \ f(SqueezeOp); \ f(ExpandOp); \ + f(RepeatOp); \ f(ViewAsScalar); \ f(ViewOp); \ f(CatOp); \ @@ -147,6 +148,7 @@ class Val; #define DISPATCH_FOR_ALL_HIR_EXPRS(f) \ f(HostUnit); \ f(PostOnStream); \ + f(LaunchKernel); \ f(SetCurrentStream); \ f(GetCurrentStream); \ f(Wait); \ diff --git a/csrc/fusion.cpp b/csrc/fusion.cpp index 0d6a2196640..3a577b85796 100644 --- a/csrc/fusion.cpp +++ b/csrc/fusion.cpp @@ -419,6 +419,8 @@ std::ostream& Fusion::print(std::ostream& os, bool include_tensor_transforms) } os << "} // %kernel\n"; + os << std::flush; + return os; } @@ -431,6 +433,8 @@ void Fusion::printKernel(const CompileParams& compile_params) { GpuLower lower(this, compile_params); lower.run(); debug() << codegen::generateCudaKernel(lower.kernel()); + + debug() << std::flush; } std::unordered_map< @@ -538,6 +542,8 @@ void Fusion::printMath(bool from_outputs_only) { debug() << expr; } debug() << "} // %kernel_math \n\n"; + + debug() << std::flush; } std::vector Fusion::inputsAndCreated() { diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index c98543a179a..1c49713eaab 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3586,6 +3586,194 @@ bool CombineReductions::shouldRun( return false; } +// This preprocessing attempts to find groups of exprs consist of an +// up-cast, followed by some ops and ended by a downcast. It is highly +// likely that such sequences of ops should never be segmented +// out. This is particularly commonly seen in fusions given by Thunder +// as it inserts fine-grained downcasting and upcasting ops. Without +// this preprocessing, a fusion may be segmented right after an +// up-cast op, for example, and in fact it happened quite frequently +// in some of the RoPE cases. This preprocessing does not completely +// avoid such segmentation boundaries, but it should become less +// likely. See also https://github.com/NVIDIA/Fuser/pull/3699. 
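// Editor's illustration (not part of the pass itself): a typical Thunder-style
// cast sandwich that this pre-merge pass tries to keep in a single segment,
// written with the usual C++ ops API (helper names assumed):
//
//   TensorView* t1 = castOp(DataType::Float, t0);    // up-cast, 2 -> 4 bytes
//   TensorView* t2 = sin(t1);                         // compute in fp32
//   TensorView* t3 = castOp(DataType::BFloat16, t2);  // down-cast, 4 -> 2 bytes
//
// Without this preprocessing, the segmenter may cut right after t1, leaving an
// fp32 intermediate on the segment boundary; merging the up-cast, the compute
// op, and the down-cast into one group up front avoids that.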
+class MergeUpAndDownCast { + public: + static void run(SegmentCandidateFinder* segment_candidate_finder) { + MergeUpAndDownCast group_cast(segment_candidate_finder); + } + + private: + MergeUpAndDownCast(SegmentCandidateFinder* segment_candidate_finder) + : segment_candidate_finder_(segment_candidate_finder) { + merge(); + } + + void merge() { + bool merged = true; + while (merged) { + merged = false; + std::unordered_set considered_groups; + + for (SegmentedGroup* group : segment_candidate_finder_->groups()) { + // If the group is an up-cast group, see if there's a + // candidate group starting with the group. + if (!isUpCast(group) || considered_groups.count(group)) { + continue; + } + + auto groups_to_merge = getCandidateCastGroup(group); + if (groups_to_merge.size() < 2) { + continue; + } + + for (auto group : groups_to_merge) { + considered_groups.insert(group); + } + + // Try merging the detected group + if (mergeCastGroup(groups_to_merge)) { + merged = true; + break; + } + } + } + } + + // Try to detect a set of groups that could be merged as a cast + // group. The analysis starts with an initial group that solely + // consists of an up-cast expression. From the initial group, it + // traverses its neighbor groups. If the group is an down-cast group, + // it only traverses through the consumer edges. If it's an up-cast + // group, it only traverses through the producer edges. + // + // Additionaly, this traversal has several safeguards to keep the + // DAG property intact: + // + // - For a given group, it does not visit its consumers if it has + // multiple consumers, even if the group is not a down-cast + // group. + // - Similarly, it does not visit a producer if the producer has + // multiple cosumers. + // + // The basic form of this set of groups should look like an up-cast + // group, followed by some op groups and ended by a down-cast + // group. However, it is not always the case because of the above + // safeguards. For example, the following groups would be detected + // as a cast group. + // + // t1 = bf16ToFp32(t0) + // t2 = neg(t1) + // t3 = sin(t2) + // t4 = cos(t2) + // t5 = fp32ToBf16(t3) + // t6 = fp32ToBf16(t4) + // + // In this case, t1 and t2 would be detected as a candidate group, + // but t3 and t4 would not be included. While we could certainly + // extend the analysis, it would need to make sure the DAG property + // is not violated. + std::vector getCandidateCastGroup( + SegmentedGroup* initial_group) { + std::vector groups_to_merge; + std::unordered_set groups_to_merge_set; + + std::deque to_visit; + to_visit.push_back(initial_group); + + while (!to_visit.empty()) { + SegmentedGroup* group = to_visit.front(); + to_visit.pop_front(); + + if (groups_to_merge_set.count(group)) { + continue; + } + + // For simplicity, all groups are assumed to be the initial + // single-expr groups. Skip if not + + groups_to_merge.push_back(group); + groups_to_merge_set.insert(group); + + // Consumer traversal. Stop if this group is a down cast + // group. Also stop if there are multiple consumer edges to + // simplify keeping the DAG property. 
+ if (!isDownCast(group) && group->consumer_edges.size() == 1) { + auto consumer_edge = group->consumer_edges.at(0); + SegmentedGroup* consumer_group = consumer_edge->to; + if (!groups_to_merge_set.count(consumer_group)) { + to_visit.push_back(consumer_group); + } + } + + if (!isUpCast(group)) { + for (const auto producer_edge : group->producer_edges) { + SegmentedGroup* producer_group = producer_edge->from; + // Don't add producers that have more than multiple consumers + if (producer_group->consumer_edges.size() > 1) { + continue; + } + if (!groups_to_merge_set.count(producer_group)) { + to_visit.push_back(producer_group); + } + } + } + } + + return groups_to_merge; + } + + // Try merging a candidate cast group. Return true if merged. + bool mergeCastGroup(const std::vector& groups) { + auto sched_type = tryMerge( + segment_candidate_finder_->segmented_fusion_.get(), + segment_candidate_finder_->runtimeInfo(), + groups); + + if (sched_type == SchedulerType::None) { + return false; + } + + segment_candidate_finder_->mergeAllGivenGroups(groups); + + return true; + } + + bool isUpCast(SegmentedGroup* group) const { + if (auto precision_bits = getProducerConsumerPrecision(group); + precision_bits.has_value()) { + return precision_bits->first < precision_bits->second; + } else { + return false; + } + } + + bool isDownCast(SegmentedGroup* group) const { + if (auto precision_bits = getProducerConsumerPrecision(group); + precision_bits.has_value()) { + return precision_bits->first > precision_bits->second; + } else { + return false; + } + } + + std::optional> getProducerConsumerPrecision( + SegmentedGroup* group) const { + if (group->exprs().size() != 1) { + return std::nullopt; + } + + auto uop = dynamic_cast(group->exprs().front()); + if (uop == nullptr || uop->getUnaryOpType() != UnaryOpType::Cast) { + return std::nullopt; + } + + return ir_utils::getPrecisionOfProducerConsumerTensors(uop); + } + + private: + SegmentCandidateFinder* segment_candidate_finder_ = nullptr; +}; + namespace { //! Returns true if group1 and group2 are an immediate producer-consumer pair. @@ -3945,6 +4133,9 @@ void SegmentCandidateFinder::findSegments() { removeScalarEdges(); // Run pre-merge heuristics + MergeUpAndDownCast::run(this); + segmented_fusion_->validateIfDebug(true); + if (options_.run_combine_reductions && CombineReductions::shouldRun(this)) { CombineReductions::run(this); } diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h index 59f8ff2d574..c70aab19e49 100644 --- a/csrc/fusion_segmenter.h +++ b/csrc/fusion_segmenter.h @@ -488,6 +488,7 @@ class GroupDependencyAnalysis; // Manual node merging passes class CombineReductions; +class MergeUpAndDownCast; //! Options to configure/debug candidate finder struct SegmentCandidateFinderOptions { @@ -691,6 +692,7 @@ class SegmentCandidateFinder { //! eventually should have a dedicated interface //! instead of keeping adding friends friend class CombineReductions; + friend class MergeUpAndDownCast; //! 
options to configure and debug the segment process SegmentCandidateFinderOptions options_; diff --git a/csrc/host_ir/container.h b/csrc/host_ir/container.h index 54b81292d82..aa87785c74f 100644 --- a/csrc/host_ir/container.h +++ b/csrc/host_ir/container.h @@ -9,6 +9,7 @@ #include #include +#include namespace nvfuser { @@ -41,10 +42,19 @@ class HostIrContainer final : public Fusion { return top_level_exprs_.push_back(expr); } + void pushBackKernelExecutor(std::unique_ptr ke) { + return kernel_executors_.push_back(std::move(ke)); + } + + KernelExecutor* getKernelExecutor(int64_t index) const { + return kernel_executors_.at(index).get(); + } + Stream* getDefaultStream(); private: std::vector top_level_exprs_; + std::vector> kernel_executors_; Stream* default_stream_ = nullptr; }; diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp index 1b2554cdabb..0f9f3da6921 100644 --- a/csrc/host_ir/executor.cpp +++ b/csrc/host_ir/executor.cpp @@ -201,6 +201,7 @@ HostIrEvaluator::HostIrEvaluator( {container_->getDefaultStream(), c10::cuda::getDefaultCUDAStream( static_cast(device_index))}); + expr_evaluator_.bind("numberOfStreams", params_.number_of_streams); } std::vector HostIrEvaluator::runWithInput( @@ -297,6 +298,32 @@ void HostIrEvaluator::handle(Synchronize* synchronize) { NVFUSER_CUDA_RT_SAFE_CALL(cudaEventDestroy(event)); } +void HostIrEvaluator::handle(LaunchKernel* launch_kernel) { + std::vector input_IValues; + KernelArgumentHolder args = + KernelArgumentHolder::createKernelArgumentHolder(input_IValues); + for (auto& input : launch_kernel->inputs()) { + NVF_ERROR( + expr_evaluator_.isKnown(input), + "No buffer associated with Val ", + input, + " for handling ", + launch_kernel->toString()); + PolymorphicValue input_evaluation = expr_evaluator_.evaluate(input); + args.push(input_evaluation); + } + + // run the compiled kernel + std::vector outputs = + container_->getKernelExecutor(launch_kernel->getIndex())->run(args); + + // Store the outputs in the context + for (auto output_idx : c10::irange(outputs.size())) { + expr_evaluator_.bind( + launch_kernel->outputs().at(output_idx), outputs.at(output_idx)); + } +} + void HostIrEvaluator::handle(PostOnStream* post_ir) { std::vector input_IValues; for (auto& input : post_ir->inputs()) { diff --git a/csrc/host_ir/executor.h b/csrc/host_ir/executor.h index a51dc32aed4..2797948975a 100644 --- a/csrc/host_ir/executor.h +++ b/csrc/host_ir/executor.h @@ -74,6 +74,9 @@ struct HostIrEvaluatorParams { // Experimental: whether to cache fusion executor. 
WAR: avoid recompilation // but implicitely assumes that the input shape don't change over iterations bool cache_fusion_executor = false; + // number of additional cuda streams to use at runtime for comm+compute + // pipelining + int64_t number_of_streams = 4; }; class HostIrEvaluator final : public OptOutDispatch { @@ -115,6 +118,7 @@ class HostIrEvaluator final : public OptOutDispatch { void handle(GetCurrentStream* get_current_stream) override; void handle(Synchronize* synchronize) override; void handle(PostOnStream* post_ir) override; + void handle(LaunchKernel* post_ir) override; void handle(Communication* communication) override; void handle(P2PCommunication* communication) override; void handle(Wait* wait) override; diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp index 49b33f59823..c99ddb2f345 100644 --- a/csrc/host_ir/host_ir.cpp +++ b/csrc/host_ir/host_ir.cpp @@ -119,6 +119,36 @@ bool PostOnStream::sameAs(const Statement* other) const { return false; } +LaunchKernel::LaunchKernel( + IrBuilderPasskey passkey, + int64_t hic_executor_index, + const std::vector& inputs, + const std::vector& outputs) + : Expr(passkey, inputs, outputs, {}) { + addDataAttribute(hic_executor_index); +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(LaunchKernel) + +std::string LaunchKernel::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << "LaunchKernel(" + << "Inputs: {"; + std::for_each(inputs().begin(), inputs().end(), [&ss](auto input) { + ss << input->toString(0) << ", "; + }); + ss << "}, Outputs: {"; + std::for_each(outputs().begin(), outputs().end(), [&ss](auto output) { + ss << output->toString(0) << ", "; + }); + ss << "})" << std::endl; + return ss.str(); +} + +std::string LaunchKernel::toInlineString(int indent_size) const { + NVF_CHECK(false, "Can not be printed inline"); +} + Stream::Stream(IrBuilderPasskey passkey, Val* index) : Val(passkey, ValType::Stream), index_(index) {} diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h index 82d67d6f4cc..3ca06779684 100644 --- a/csrc/host_ir/host_ir.h +++ b/csrc/host_ir/host_ir.h @@ -115,6 +115,35 @@ class PostOnStream : public Expr { } }; +class LaunchKernel : public Expr { + public: + using Expr::Expr; + LaunchKernel( + IrBuilderPasskey passkey, + int64_t hic_executor_index, // Index into the HostIrContainer's vector of + // KernelExecutors--i.e., the kernel this IR + // should launch + const std::vector& inputs, + const std::vector& outputs); + + LaunchKernel(const LaunchKernel& other) = delete; + LaunchKernel& operator=(const LaunchKernel& other) = delete; + LaunchKernel(LaunchKernel&& other) = delete; + LaunchKernel& operator=(LaunchKernel&& other) = delete; + + NVFUSER_DECLARE_CLONE_AND_CREATE + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + const char* getOpString() const override { + return "hir::LaunchKernel"; + } + + int64_t getIndex() const { + return attribute(0); + } +}; + class Stream : public Val { public: // if index is provided, the IR represents the streams whose index is the @@ -208,6 +237,8 @@ class Wait : public Expr { } }; +// Makes the current stream wait on the given stream. Non-blocking from the host +// point of view. 
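To make the new node concrete, here is a minimal sketch of how a LaunchKernel could be wired into a host IR program and evaluated. The container, node, and evaluator names come from the diff above; the exact constructor and runWithInput signatures, as well as the helper values (ke, in_tv, out_tv, at_input), are assumptions for illustration, not verbatim API.

auto hic = std::make_unique<hir::HostIrContainer>();
FusionGuard fg(hic.get());

// Assume `ke` is a KernelExecutor already compiled for a single-input,
// single-output fusion, and in_tv/out_tv are the matching IR values in `hic`.
hic->pushBackKernelExecutor(std::move(ke));  // stored at index 0

auto* launch = IrBuilder::create<hir::LaunchKernel>(
    /*hic_executor_index=*/0,
    std::vector<Val*>{in_tv},
    std::vector<Val*>{out_tv});
hic->pushBackTopLevelExprs(launch);

hir::HostIrEvaluator hie(std::move(hic));
auto outputs = hie.runWithInput({{in_tv, at_input}});  // at_input: at::Tensor

At runtime, the handle(LaunchKernel*) shown above evaluates the node's inputs, runs the KernelExecutor stored at the node's index, and binds the produced tensors back to the node's outputs in the evaluator context.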
class Synchronize : public Expr { public: using Expr::Expr; diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp index 8e97b958a9a..ea52ba5eeb6 100644 --- a/csrc/host_ir/lower.cpp +++ b/csrc/host_ir/lower.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -235,6 +236,10 @@ void lowerToReduceScatter( std::vector HostIrLower::lower(Expr* c) { FusionGuard fg(c->fusion()); + if (c->isA()) { + return lowerToCollectiveBasedPipelinedGemmComm(c); + } + std::vector comms; NVF_ERROR( c->inputs().size() == 1 && c->input(0)->isA() && @@ -302,16 +307,19 @@ std::vector HostIrLower::lower(Expr* c) { return comms; } -bool HostIrLower::canLower(Expr* expr) { +bool HostIrLower::canLower(Expr* expr, bool ignore_inner_resharding) { if (!isResharding(expr)) { return true; } if (!ir_utils::isTvOp(expr)) { return false; } - if (expr->isA()) { - auto in = expr->as()->in()->as(); - auto out = expr->as()->out()->as(); + if (auto* reduction = dynamic_cast(expr)) { + if (!ignore_inner_resharding && isInnerResharding(expr)) { + return false; + } + auto in = reduction->in()->as(); + auto out = reduction->out()->as(); // get the reduced axis std::vector reduction_axis; std::copy_if( @@ -328,10 +336,126 @@ bool HostIrLower::canLower(Expr* expr) { PairwiseLogicalDomainMap(in, out).mapConsumerToProducer(); auto c2p_map_it = c2p_map.find(reduction_axis.at(0)); return c2p_map_it != c2p_map.end() && c2p_map_it->second->isDeviceDim(); - } else { - return expr->isA() && - (expr->as()->opType() == LoadStoreOpType::Set); + } else if (auto* ldst = dynamic_cast(expr)) { + if (!ignore_inner_resharding && isInnerResharding(expr)) { + return false; + } + return ldst->as()->opType() == LoadStoreOpType::Set; + } else if (auto* matmul = dynamic_cast(expr)) { + // For now we only support c = matmul(a,b) when b,c are fully replicated and + // a is sharded on axis 1 + return !isSharded(matmul->inB()) && !isSharded(matmul->out()) && + matmul->inA()->axis(0)->getParallelType() == ParallelType::Serial && + getShardedLogicalAxis(matmul->inA(), ParallelType::DIDx) == 1 && + matmul->out()->axis(0)->getParallelType() == ParallelType::Stream; } + return false; +} + +std::vector HostIrLower::lowerToCollectiveBasedPipelinedGemmComm( + Expr* expr) { + auto matmul = expr->as(); + NVF_ERROR(matmul != nullptr, "Expect a MatmulOp, got", expr); + TensorView* tva = matmul->inA(); + TensorView* tvb = matmul->inB(); + TensorView* tvc = matmul->out(); + NVF_ERROR( + !isSharded(tvb), "The B operand ", tvb, " is expected to not be sharded"); + NVF_ERROR( + !isSharded(tvc), + "The output ", + matmul->out(), + " is expected to not be sharded"); + const int64_t sharded_axis_index = + getShardedLogicalAxis(tva, ParallelType::DIDx); + IterDomain* stream_axis = tva->axis(0); + NVF_ERROR( + stream_axis->getParallelType() == ParallelType::Serial && + sharded_axis_index == 1, + "The operand A ", + tva, + " is expected to be sharded on the dimension 1"); + + auto hic = FusionGuard::getCurFusion()->as(); + + auto* get_current_stream = IrBuilder::create(); + hir::Stream* original_stream = get_current_stream->stream(); + + TensorView* tva_allgathered = + ops::newValLike(tva, tva->dtype())->as(); + tva_allgathered->axis(sharded_axis_index)->parallelize(ParallelType::Serial); + tva_allgathered->setMemoryType(MemoryType::Global); + auto* allocate_tva_allgathered = + IrBuilder::create(tva_allgathered, MemoryType::Global); + + tvc->setMemoryType(MemoryType::Global); + auto* allocate_tvc = + IrBuilder::create(tvc, MemoryType::Global); 
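  // Editor's sketch of the host IR this lowering emits (the loop body is
  // assembled below); stream indices rotate over `numberOfStreams`:
  //
  //   original_stream = GetCurrentStream()
  //   allocate tva_allgathered, tvc           (global memory)
  //   for j in [0, extent(stream_axis)):
  //     SetCurrentStream(stream[j % numberOfStreams])
  //     tva_j             = select(tva, 0, j)
  //     tva_allgathered_j = select(tva_allgathered, 0, j)
  //     Allgather(tva_allgathered_j <- tva_j); Wait
  //     tvc_j             = select(tvc, 0, j)
  //     tvc_j             = matmul(tva_allgathered_j, tvb)
  //     SetCurrentStream(original_stream)
  //     Synchronize(stream[j % numberOfStreams])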
+ + auto* j = + IrBuilder::create(DataType::Index); // running index of the for-loop + auto* start = hic->zeroVal(); + auto* stop = stream_axis->extent(); + auto* step = hic->oneVal(); + auto* for_loop = IrBuilder::create( + stream_axis, + /*index=*/j, + start, + stop, + step, + /*vectorize=*/false, + /*vectorize_shift=*/nullptr, + /*unroll_required=*/false, + CircularBufferLoopStage::NotApplicable, + /*circular_buffer_loop_stage_depth=*/0); + + auto* number_of_streams = + IrBuilder::create("numberOfStreams", DataType::Int); + auto* stream_index = mod(j, number_of_streams); + auto* stream = IrBuilder::create(stream_index); + auto* set_stream = IrBuilder::create(stream); + + TensorView* tva_j = select(tva, 0, j); + TensorView* tva_allgathered_j = select(tva_allgathered, 0, j); + TensorView* tvc_j = select(tvc, 0, j); + + NVF_ERROR( + tva->hasDeviceMesh(), + "The matmul's input ", + tva, + "is expected to have a DeviceMesh"); + for (auto tv : {tva_j, tva_allgathered_j, tvc_j}) { + tv->setDeviceMesh(tva->getDeviceMesh()); + } + + auto* communication = IrBuilder::create( + CommunicationType::Allgather, + /*out=*/tva_allgathered_j, + /*in=*/tva_j, + /*team=*/tva->getDeviceMesh().vector()); + auto* wait = IrBuilder::create(communication); + + auto* mm = IrBuilder::create(tvc_j, tva_allgathered_j, tvb); + + auto* set_back_original_stream = + IrBuilder::create(original_stream); + auto* sync_stream = IrBuilder::create(stream); + + std::vector loop_body = { + set_stream, + tva_j->definition(), + tva_allgathered_j->definition(), + communication, + wait, + tvc_j->definition(), + mm, + set_back_original_stream, + sync_stream}; + for (Expr* expr : loop_body) { + for_loop->body().push_back(expr); + } + + return {get_current_stream, allocate_tva_allgathered, allocate_tvc, for_loop}; } std::unique_ptr HostIrLower::lower( @@ -397,20 +521,20 @@ std::unique_ptr HostIrLower::lower( for (auto* expr : HostIrLower::lower(ir_cloner.clone(group->exprs().at(0)))) { // Allocate the recv buffers of communications - NVF_ERROR( - expr->isA(), - "Expected a Communication but got ", - expr); - auto* communication = expr->as(); - TensorView* tv = communication->out(); - if (tv->getDeviceMesh().has(my_device_index)) { - auto* allocate = - IrBuilder::create(tv, MemoryType::Global); - hic->pushBackTopLevelExprs(allocate); + if (expr->isA()) { + auto* communication = expr->as(); + TensorView* tv = communication->out(); + if (tv->getDeviceMesh().has(my_device_index)) { + auto* allocate = + IrBuilder::create(tv, MemoryType::Global); + hic->pushBackTopLevelExprs(allocate); + } + } + hic->pushBackTopLevelExprs(expr); + if (expr->isA()) { + auto wait = IrBuilder::create(expr->as()); + hic->pushBackTopLevelExprs(wait); } - hic->pushBackTopLevelExprs(communication); - auto wait = IrBuilder::create(communication); - hic->pushBackTopLevelExprs(wait); } } else { auto host_unit = IrBuilder::create( diff --git a/csrc/host_ir/lower.h b/csrc/host_ir/lower.h index 6a1d44247d2..02d120cb734 100644 --- a/csrc/host_ir/lower.h +++ b/csrc/host_ir/lower.h @@ -16,7 +16,10 @@ namespace nvfuser { class HostIrLower { public: - static bool canLower(Expr* expr); + // The flag `ignore_inner_resharding` is useful because the preseg passes + // `InsertReshardingsPass` and `ReorderShardedAxisPass` want different + // behaviors + static bool canLower(Expr* expr, bool ignore_inner_resharding = false); // Lower a sharded Expr into a series of Communication. 
static std::vector lower(Expr* c); @@ -24,6 +27,9 @@ class HostIrLower { static std::unique_ptr lower( std::unique_ptr fusion, int64_t my_device_index); + + private: + static std::vector lowerToCollectiveBasedPipelinedGemmComm(Expr* expr); }; } // namespace nvfuser diff --git a/csrc/id_model/id_model.cpp b/csrc/id_model/id_model.cpp index 075987d16fe..12f84e2d163 100644 --- a/csrc/id_model/id_model.cpp +++ b/csrc/id_model/id_model.cpp @@ -754,7 +754,7 @@ void IdModel::initializeLoopGraph(const StatefulInliningInfo& info) { } } -ValGraph& IdModel::buildLoopGraph() { +ValGraph& IdModel::buildLoopGraph(bool force_full_loop_promotion_analysis) { // Make sure the depedent graphs are already built maybeBuildGraph(IdMappingMode::EXACT); maybeBuildGraph(IdMappingMode::PERMISSIVE); @@ -767,7 +767,10 @@ ValGraph& IdModel::buildLoopGraph() { validateLoopGraphHasNoSelfMappedLeafDomains(); loop_promotion_map_ = LoopPromotionMapBuilder::get( - *this, inlining_info, loop_promotion_map_builder_callback_); + *this, + inlining_info, + loop_promotion_map_builder_callback_, + force_full_loop_promotion_analysis); // New domains are added. Make sure there's still no self mapping in // the loop domains diff --git a/csrc/id_model/id_model.h b/csrc/id_model/id_model.h index 3708fa942bf..32c206dda6d 100644 --- a/csrc/id_model/id_model.h +++ b/csrc/id_model/id_model.h @@ -198,7 +198,11 @@ class IdModel : public PolymorphicBase { // Fills disjoint_ids_[IdMappingMode::LOOP]. Map only inlined // domains that are mapped in the permissive graph. Build the Exact // and Permissive graphs as well if not yet done. - ValGraph& buildLoopGraph(); + // + // (For debugging only) When force_full_loop_promotion_analysis is + // true, it always performs the full loop promotion analysis even + // when it's possible to take a quicker shortcut. + ValGraph& buildLoopGraph(bool force_full_loop_promotion_analysis = false); // Build a graph. Dependent graphs are also built if not yet done. 
void buildGraph(IdMappingMode mode); diff --git a/csrc/id_model/indexing.cpp b/csrc/id_model/indexing.cpp index 8cc0da959e2..567c239053a 100644 --- a/csrc/id_model/indexing.cpp +++ b/csrc/id_model/indexing.cpp @@ -938,11 +938,7 @@ IndexingInfo TensorIndexer::computeIndex( const std::vector& index_ids, const std::vector& for_loops) const { const auto loop_domains = getLoopIds(expr, id_model_); - - const ValGroups loop_groups = traversalGraph().toGroups(loop_domains); - const ExprPath traversal_path = IndexingTraversal::getExprsBetween( - expr, traversalGraph(), loop_domains, index_ids); - + const ExprPath traversal_path = getIndexingPath(expr, index_ids); const std::unordered_map initial_index_map = getInitialIndexMap(loop_domains, for_loops); @@ -980,11 +976,17 @@ IndexingInfo TensorIndexer::computeIndex( } } + // Fill in broadcast index groups by zero + auto index_map = index_compute.indexMap(); + for (const auto index_id : index_ids) { + if (index_id->isBroadcast()) { + index_map[traversalGraph().toGroup(index_id)] = + index_id->fusion()->zeroVal(); + } + } + IndexingInfo info{ - loop_domains, - traversal_path, - index_compute.indexMap(), - loop_group_dependencies}; + loop_domains, traversal_path, index_map, loop_group_dependencies}; return info; } @@ -1248,6 +1250,25 @@ std::vector TensorIndexer::getPredicates( return info_vec; } +ExprPath TensorIndexer::getIndexingPath( + const Expr* expr, + const std::vector& index_ids) const { + // Exclude broadcast IDs as their indices should always be zero + // and they may not be reachable from the loop domain + std::vector non_broadcast_index_ids; + for (const auto index_id : index_ids) { + if (!index_id->isBroadcast()) { + non_broadcast_index_ids.push_back(index_id); + } + } + + return IndexingTraversal::getExprsBetween( + expr, + traversalGraph(), + getLoopIds(expr, id_model_), + non_broadcast_index_ids); +} + std::pair, std::vector> TensorIndexer:: getContigDomainsAndStrides( const IndexingAllocationInfo& alloc_info, diff --git a/csrc/id_model/indexing.h b/csrc/id_model/indexing.h index 645a8941df3..21d9a931a96 100644 --- a/csrc/id_model/indexing.h +++ b/csrc/id_model/indexing.h @@ -118,6 +118,12 @@ class TensorIndexer { const std::vector& for_loops, ForLoop* unswitched_loop = nullptr) const; + // Get the indexing traversal path for indexing a given list of IDs + // for a given expr + ExprPath getIndexingPath( + const Expr* expr, + const std::vector& index_ids) const; + private: // Build a map of loop groups to their index Vals. See the comment // on loop_index_map_. diff --git a/csrc/id_model/loop_promotion.cpp b/csrc/id_model/loop_promotion.cpp index 1c055943eda..08ae225e6eb 100644 --- a/csrc/id_model/loop_promotion.cpp +++ b/csrc/id_model/loop_promotion.cpp @@ -17,8 +17,12 @@ namespace nvfuser { LoopPromotionMapBuilder::LoopPromotionMapBuilder( IdModel& id_model, const StatefulInliningInfo& inlining_info, - LoopPromotionMapBuilderCallback* callback) - : id_model_(id_model), inlining_info_(inlining_info), callback_(callback) {} + LoopPromotionMapBuilderCallback* callback, + bool force_full_loop_promotion_analysis) + : id_model_(id_model), + inlining_info_(inlining_info), + callback_(callback), + force_full_loop_promotion_analysis_(force_full_loop_promotion_analysis) {} ValGraph& LoopPromotionMapBuilder::idGraph(IdMappingMode mode) { return id_model_.idGraph(mode); @@ -97,11 +101,12 @@ std::unordered_map LoopPromotionMapBuilder:: namespace { -// Check if all the domains of each loop group are exactly mapped. 
If -// so, the full promotion analysis should not be necessary. Only the +// Check if each loop group has at most one group of concrete domains. If +// so, the full promotion analysis should not be necessary since +// finding the promotion ID is a trivial probelm. Only the // loop groups of the loop domains need to be checked as loop // promotion does not matter for the other domains. -bool isLoopGraphUniform(const IdModel& id_model) { +bool isLoopGraphAlmostUniform(const IdModel& id_model) { for (const auto tv : id_model.tvs()) { if (tv->isFusionInput()) { continue; @@ -111,8 +116,22 @@ bool isLoopGraphUniform(const IdModel& id_model) { id_model.idGraph(IdMappingMode::LOOP).toGroup(loop_id); const auto all_exact_groups = id_model.idGraph(IdMappingMode::EXACT).toGroups(*loop_group); - if (all_exact_groups.size() > 1) { - return false; + if (all_exact_groups.size() == 1) { + continue; + } + + // Even when multiple exact groups are found, if there's only + // one concrete group and all the others are broadcast, it's + // obvious that the concrete group represents the promotion. + bool concrete_group_found = false; + for (const auto& exact_group : all_exact_groups) { + if (!exact_group->front()->as()->isBroadcast()) { + if (concrete_group_found) { + // multiple concrete groups + return false; + } + concrete_group_found = true; + } } } } @@ -126,8 +145,9 @@ std::unordered_map LoopPromotionMapBuilder::build() { // Some quick shortcut conditions to skip the full loop promotion // analysis. These are not comprehensive. Should add more conditions // if necessary. - if (inlining_info_.p2c_root_broadcast_resolution_map.empty() || - isLoopGraphUniform(id_model_)) { + if (!force_full_loop_promotion_analysis_ && + (inlining_info_.p2c_root_broadcast_resolution_map.empty() || + isLoopGraphAlmostUniform(id_model_))) { return buildWithNoBroadcast(); } @@ -936,8 +956,10 @@ void LoopPromotionMapBuilder::sanityCheckLoopPromotionMap( std::unordered_map LoopPromotionMapBuilder::get( IdModel& id_model, const StatefulInliningInfo& inlining_info, - LoopPromotionMapBuilderCallback* callback) { - LoopPromotionMapBuilder builder(id_model, inlining_info, callback); + LoopPromotionMapBuilderCallback* callback, + bool force_full_loop_promotion_analysis) { + LoopPromotionMapBuilder builder( + id_model, inlining_info, callback, force_full_loop_promotion_analysis); return builder.build(); } @@ -967,14 +989,21 @@ std::unordered_map LoopPromotionMapBuilder:: (int64_t)StmtSort::getExprsTo({loop_id->extent()}).size(); auto this_is_const = loop_id->extent()->isConstInt(); - // First ID - if (promotion == nullptr) { + // A group is allowed to have one single exact group of concrete + // IDs with a broadcast group. + if (promotion == nullptr || + (promotion->isBroadcast() && !loop_id->isBroadcast())) { is_const = this_is_const; promotion = loop_id; num_exprs = this_num_exprs; continue; } + // Ignore broadcast if a concrete ID is already found + if (!promotion->isBroadcast() && loop_id->isBroadcast()) { + continue; + } + // If new ID is non-const while the current promotion is const, // or if both IDs are const or non-const and the number of // expressions is not smaller, keep the current promotion diff --git a/csrc/id_model/loop_promotion.h b/csrc/id_model/loop_promotion.h index 88ff26a5d6f..1c6aa486c97 100644 --- a/csrc/id_model/loop_promotion.h +++ b/csrc/id_model/loop_promotion.h @@ -48,16 +48,22 @@ class LoopPromotionMapBuilder { // Build a map of loop groups to IterDomains that represent actual // loops. 
The map is built based on the broadcast resolution with // root domains between inlined producer and consumer tensors. + // + // (For debugging only) When force_full_loop_promotion_analysis is + // true, it always performs the full loop promotion analysis even + // when it's possible to take a quicker shortcut. static std::unordered_map get( IdModel& id_model, const StatefulInliningInfo& inlining_info, - LoopPromotionMapBuilderCallback* callback = nullptr); + LoopPromotionMapBuilderCallback* callback = nullptr, + bool force_full_loop_promotion_analysis = false); private: LoopPromotionMapBuilder( IdModel& id_model, const StatefulInliningInfo& inlining_info, - LoopPromotionMapBuilderCallback* callback = nullptr); + LoopPromotionMapBuilderCallback* callback = nullptr, + bool force_full_loop_promotion_analysis = false); std::unordered_map build(); @@ -164,6 +170,11 @@ class LoopPromotionMapBuilder { IdModel& id_model_; const StatefulInliningInfo& inlining_info_; LoopPromotionMapBuilderCallback* callback_ = nullptr; + + // (For debugging only) When force_full_loop_promotion_analysis_ is + // true, it always performs the full loop promotion analysis even + // when it's possible to take a quicker shortcut. + bool force_full_loop_promotion_analysis_ = false; }; } // namespace nvfuser diff --git a/csrc/id_model/predicate_indexing.cpp b/csrc/id_model/predicate_indexing.cpp index 15edb366357..1b7733bedaf 100644 --- a/csrc/id_model/predicate_indexing.cpp +++ b/csrc/id_model/predicate_indexing.cpp @@ -26,7 +26,7 @@ std::vector getPredicateDomains( : consumer_tv->getLogicalDomain(); // Broadcast domains should not need to be predicated. Note that - // unlike indexing for TensorIndex, reduction doamins do need to be + // unlike indexing for TensorIndex, reduction domains do need to be // indexed to guard the access to the producer tensor predicate_domains.erase( std::remove_if( diff --git a/csrc/instrumentation.h b/csrc/instrumentation.h index 6cf6b598d2f..865896f5a44 100644 --- a/csrc/instrumentation.h +++ b/csrc/instrumentation.h @@ -10,7 +10,7 @@ #include #include -#include +#include // NOLINTNEXTLINE(modernize-deprecated-headers) #include diff --git a/csrc/ir/interface_nodes.h b/csrc/ir/interface_nodes.h index 4e53a6207cc..98236fd3c5f 100644 --- a/csrc/ir/interface_nodes.h +++ b/csrc/ir/interface_nodes.h @@ -227,11 +227,41 @@ inline std::ostream& operator<<(std::ostream& os, const Pipelined& pipelined) { } struct WarpSpecialized { - ParallelType on; - explicit WarpSpecialized(ParallelType on) : on(on) {} + ParallelType on = ParallelType::Serial; + // The number of registers for load and compute warps respectively. + std::optional> num_registers = std::nullopt; + + explicit WarpSpecialized( + ParallelType on, + std::pair num_registers) + : on(on), num_registers(num_registers) { + validateRegisterSharing(); + } + explicit WarpSpecialized(ParallelType on) + : on(on), num_registers(std::nullopt) {} WarpSpecialized() = default; + + void validateRegisterSharing() { + // short-circuit: register sharing is not used. 
+ if (!num_registers.has_value()) { + return; + } + auto validate_num_registers = [](int64_t a) { + NVF_ERROR( + a >= 24 && a <= 256 && a % 8 == 0, + "The number of registers for setmaxnreg must be between 24 and", + " 256 (inclusive) and be a multiple of 8."); + }; + validate_num_registers(num_registers.value().first); + validate_num_registers(num_registers.value().second); + NVF_ERROR( + num_registers.value().first <= num_registers.value().second, + "The number of registers for load warp group must be <= to the number", + " of registers for the compute warp groups."); + } + bool operator==(const WarpSpecialized& other) const { - return on == other.on; + return on == other.on && num_registers == other.num_registers; } }; @@ -252,7 +282,15 @@ inline std::ostream& operator<<( default: NVF_THROW("Invalid parallel type"); } - return os << "WarpSpecializedOn" << parallel_type_str; + std::string num_registers = "RegisterSharing_None"; + if (warp_specialized.num_registers.has_value()) { + auto&& [decrease_num_reg, increase_num_reg] = + warp_specialized.num_registers.value(); + std::stringstream s; + s << "RegisterSharing_" << decrease_num_reg << "_" << increase_num_reg; + num_registers = s.str(); + } + return os << "WarpSpecializedOn" << parallel_type_str << num_registers; } using CircularBufferType = std::variant; diff --git a/csrc/ir/internal_nodes.h b/csrc/ir/internal_nodes.h index df9b0bf50c9..6aebcb3c457 100644 --- a/csrc/ir/internal_nodes.h +++ b/csrc/ir/internal_nodes.h @@ -1527,6 +1527,42 @@ class ExpandOp : public Expr { const std::vector& inputs) const override; }; +// Represents a repetition of broadcast IDs. Repetitions of +// non-broadcast IDs are represented using the broadcast, expand and +// reshape pattern. See the repeat op implementation in ops/alias.cpp +// as well as the TranslateRepeatToExpand preseg pass. +class RepeatOp : public Expr { + public: + using Expr::Expr; + + // in: Input tensor that have broadcast logical IDs. + // out: Output tensor where some of the input broadcast logical IDs + // are converted to concrete IDs. Their extents represent the + // repetition factor of each ID. 
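  //
  // Editor's example: with
  //   in:  T1[ b{1}, i{N} ]
  //   out: T2[ i{4}, i{N} ]
  // the broadcast logical ID of the input maps to a concrete ID of extent 4
  // in the output, i.e. dimension 0 is repeated 4 times.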
+ RepeatOp(IrBuilderPasskey, TensorView* out, TensorView* in); + + NVFUSER_DECLARE_CLONE_AND_CREATE + + const char* getOpString() const override { + return "RepeatOp"; + } + + std::string toString(int indent_size = 0) const override; + std::string toInlineString(int indent_size = 0) const override; + + TensorView* out() const { + return output(0)->as(); + } + + TensorView* in() const { + return input(0)->as(); + } + + std::vector evaluate( + const ExpressionEvaluator& ee, + const std::vector& inputs) const override; +}; + class ViewAsScalar : public Expr { public: using Expr::Expr; diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index 5f0528991b2..9b0f09b5c25 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -2135,6 +2135,86 @@ std::vector ExpandOp::evaluate( NVFUSER_DEFINE_CLONE_AND_CREATE(ExpandOp) +RepeatOp::RepeatOp(IrBuilderPasskey passkey, TensorView* out, TensorView* in) + : Expr(passkey) { + auto in_domain = TensorDomain::noReductions(in->getLogicalDomain()); + const auto& out_domain = out->getLogicalDomain(); + + NVF_ERROR(in_domain.size() == out_domain.size()); + + NVF_ERROR( + std::none_of( + out->getLogicalDomain().begin(), + out->getLogicalDomain().end(), + [](IterDomain* out_logical_id) { + return out_logical_id->isReduction(); + }), + "Output should not have reduction IDs."); + + bool repetition_found = false; + for (const auto i : c10::irange(in_domain.size())) { + if (in_domain.at(i)->isBroadcast() && !out_domain.at(i)->isBroadcast()) { + NVF_ERROR(!in_domain.at(i)->hasExpandedExtent()); + NVF_ERROR(in_domain.at(i)->extent()->isOneInt()); + repetition_found = true; + } + } + + NVF_ERROR( + repetition_found, + "No repetition dim found: ", + out->toString(), + ", ", + in->toString()); + + addOutput(out); + addInput(in); +} + +std::string RepeatOp::toString(int indent_size) const { + std::stringstream ss; + indent(ss, indent_size) << out()->toString() << " = repeat( " << in() + << " )\n"; + return ss.str(); +} + +std::string RepeatOp::toInlineString(int indent_size) const { + NVF_CHECK(false, "Tensor op can not be printed inline"); +} + +std::vector RepeatOp::evaluate( + const ExpressionEvaluator& ee, + const std::vector& inputs) const { + NVF_ERROR( + inputs.size() == 1, + "RepeatOp expects exactly 1 input, but received ", + inputs.size()); + auto tensor = inputs.at(0).as(); + std::vector multipliers; + multipliers.reserve(out()->getLogicalDomain().size()); + const auto c2p = + PairwiseLogicalDomainMap(in(), out()).mapConsumerToProducer(); + for (const auto i : c10::irange(out()->getLogicalDomain().size())) { + auto out_id = out()->getLogicalDomain().at(i); + auto inp_id = c2p.at(out_id); + auto out_extent = ee.evaluate(out_id->extent()).as(); + auto inp_extent = ee.evaluate(inp_id->extent()).as(); + NVF_ERROR( + out_extent % inp_extent == 0, + "For dimension ", + i, + ", the output extent (", + out_extent, + " should be a multiple of the input extent (", + inp_extent, + ")."); + multipliers.push_back(out_extent / inp_extent); + } + return {tensor.repeat(multipliers)}; +} + +NVFUSER_DEFINE_CLONE_AND_CREATE(RepeatOp) + ViewAsScalar::ViewAsScalar( IrBuilderPasskey passkey, Val* out, @@ -3705,6 +3785,7 @@ TensorDomain* TensorDomain::flatten(int64_t start_dim, int64_t end_dim) { (is_rfactor_dim && inp_id->isBroadcast()) ? 
IterType::Iteration : inp_id->getIterType()) + .expanded_extent(nullptr) .build(); new_root_domain.push_back(out_id); } diff --git a/csrc/ir/utils.cpp b/csrc/ir/utils.cpp index 4de5d7c8097..107ab898453 100644 --- a/csrc/ir/utils.cpp +++ b/csrc/ir/utils.cpp @@ -1524,4 +1524,32 @@ std::vector strideOrderToAllocation( return allocation_domain; } +std::optional> getPrecisionOfProducerConsumerTensors( + UnaryOp* uop) { + NVF_CHECK( + uop != nullptr && uop->getUnaryOpType() == UnaryOpType::Cast, + "Invalid expr: ", + uop->toString()); + + auto inp_tv = ir_utils::getTvInput(uop); + auto out_tv = ir_utils::getTvOutput(uop); + if (inp_tv == nullptr || out_tv == nullptr) { + return std::nullopt; + } + + auto inp_dtype = inp_tv->dtype().type; + auto out_dtype = out_tv->dtype().type; + auto inp_prim_type = std::get_if(&inp_dtype); + auto out_prim_type = std::get_if(&out_dtype); + + if (inp_prim_type == nullptr || out_prim_type == nullptr || + *inp_prim_type == PrimDataType::Index || + *out_prim_type == PrimDataType::Index) { + return std::nullopt; + } + + return std::make_pair( + primDataTypeSize(*inp_prim_type), primDataTypeSize(*out_prim_type)); +} + } // namespace nvfuser::ir_utils diff --git a/csrc/ir/utils.h b/csrc/ir/utils.h index 4ac93824037..37b53a8df36 100644 --- a/csrc/ir/utils.h +++ b/csrc/ir/utils.h @@ -803,4 +803,9 @@ std::vector strideOrderToAllocation( const std::vector& logical_domain, const std::vector& stride_order); +// Returns the number of bytes of data types of the producer and +// consumer tensors of a cast unary op +std::optional> getPrecisionOfProducerConsumerTensors( + UnaryOp* cast_op); + } // namespace nvfuser::ir_utils diff --git a/csrc/logical_domain_map.h b/csrc/logical_domain_map.h index d00e070df28..0a2d9d3ca01 100644 --- a/csrc/logical_domain_map.h +++ b/csrc/logical_domain_map.h @@ -504,6 +504,10 @@ class ComputeAtLogicalDomainMapBuilder : private BackwardVisitor { mapPointwiseLikeOp(op); } + void handle(RepeatOp* op) override { + mapPointwiseLikeOp(op); + } + void handle(PadOp* op) override { // For compute-at, padded id should be mapped mapPointwiseLikeOp(op); diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp index 847557bfa3a..771bf19d6f8 100644 --- a/csrc/multidevice/utils.cpp +++ b/csrc/multidevice/utils.cpp @@ -482,6 +482,47 @@ void shardAllLike(TensorView* ref, std::vector tvs) { scheduler_utils::parallelizeAllLike( ref, tvs, {ParallelType::DIDx, ParallelType::Serial}); } + + // parallelAllLke, tries to DID-parallelize + // reduction dimensions. For example, + // + // [iDID{i1}, i2] -> (Reduce) -> [r{i1}, i2] -> (Pointwise) -> [i2] + // + // becomes + // + // [iDID{i1}, i2] -> (Reduce) -> [rDID{i1}, i2] -> (Pointwise) -> [i2] + // + // This implies that the reduction result only exists on the "home" device. + // `lower_communication` can't lower such a reduction today. lowerToReduce + // is closest but it uses the output device mesh to indicate the home device. + // Also, an extra broadcast will be needed to replicate the reduction result + // to all devices for the pointwise op. + // + // Therefore, instead, we remove the DID from reduction dimensions and + // therefore reset them to Serial. This way, + // the above becomes + // + // [iDID{i1}, i2] -> (Reduce) -> [r{i1}, i2] -> (Pointwise) -> [i2] + // + // where the reduction will be lowered to an Allreduce. + // + // Alternatively, @naoyam proposed to represent an allreduce as a reduce + // followed by a broadcasting set. 
+ // + // [iDID{i1}, i2] -> (Reduce) -> [rDID{i1}, i2] -> (Set) [i2] -> (Pointwise) + // -> [i2] + // + // This will make the semantics similar to other parallel types and therefore + // we can better leverage existing parallelization utilities. We have yet to + // pursue this because of implementation difficulty -- `lower_communication` + // would need to match the reduce-set pattern. + for (TensorView* tv : tvs) { + for (IterDomain* id : tv->getLoopDomain()) { + if (id->isReduction() && id->isDeviceDim()) { + id->parallelize(ParallelType::Serial); + } + } + } } void shardBetween( diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp index bc541623310..5729fed5b3f 100644 --- a/csrc/ops/alias.cpp +++ b/csrc/ops/alias.cpp @@ -1124,4 +1124,94 @@ TensorView* expand_as(TensorView* inp, TensorView* other) { return out_tensor; } +TensorView* repeat( + TensorView* inp_tv, + const std::vector& repeat_times) { + const auto ndims = + TensorDomain::noReductions(inp_tv->getLogicalDomain()).size(); + + // Handle repetitions of non-broadcast IDs first. Each ID is + // individully repeated by: + // + // Step 1. Insert a broadcast ID immediately outside of the + // repeated ID + // Step 2. Expand the broadcast ID by the repetition factor + // Step 3. Flatten the expanded ID and the repeated ID + // + // Note that it's also possible to repeat multiple non-broadcast IDs + // once by inserting and expanding broadcast IDs by one BroadcastOp + // and one ExpandOp. + + bool has_repetition_of_broadcast = false; + auto intermediate_tv = inp_tv; + for (const auto i : c10::irange(ndims)) { + if (repeat_times.at(i) == 1) { + continue; + } + + auto inp_id = intermediate_tv->getLogicalDomain().at(i); + + // Broadcast is handled after this + if (inp_id->isBroadcast()) { + has_repetition_of_broadcast = true; + continue; + } + + // Step 1: Insert a broadcast ID + std::vector bcast_flags(ndims + 1, false); + bcast_flags.at(i) = true; + auto broadcast_tv = broadcast(intermediate_tv, bcast_flags); + + // Step 2: Expand the broadcast ID for the repetition factor + std::vector expanded_sizes( + bcast_flags.size(), IrBuilder::create(-1L)); + expanded_sizes.at(i) = IrBuilder::create(repeat_times.at(i)); + auto expanded_tv = expand(broadcast_tv, expanded_sizes); + + // Step 3: Reshape to merge the broadcast ID and the repeated ID + intermediate_tv = flatten(expanded_tv, (int64_t)i, (int64_t)i + 1); + } + + if (!has_repetition_of_broadcast) { + return intermediate_tv; + } + + // Repeat broadcast IDs. The expand approach doesn't work as reshape + // would just squeeze repeated IDs and thus there would be no + // merge. Expanded IDs would remain to be expanded broadcast IDs. To + // concretize them, use RepeatOp + std::vector new_domain; + new_domain.reserve(ndims); + std::vector> new_contiguity; + new_contiguity.reserve(ndims); + + for (const auto i : c10::irange(ndims)) { + auto inp_id = intermediate_tv->getLogicalDomain().at(i); + IterDomain* new_id = nullptr; + + if (repeat_times.at(i) > 1 && inp_id->isBroadcast()) { + new_id = IterDomainBuilder(inp_id) + .extent(IrBuilder::create( + repeat_times.at(i), DataType::Index)) + .iter_type(IterType::Iteration) + .build(); + } else { + new_id = inp_id->cloneWithoutRFactor(); + } + + new_domain.push_back(new_id); + new_contiguity.push_back( + new_id->isBroadcast() ? 
std::optional(std::nullopt) + : std::optional(true)); + } + + auto out_tv = IrBuilder::create( + IrBuilder::create(new_domain, new_contiguity), + inp_tv->dtype()); + + IrBuilder::create(out_tv, intermediate_tv); + + return out_tv; +} + } // namespace nvfuser diff --git a/csrc/ops/alias.h b/csrc/ops/alias.h index 1b6d443c38c..8a896dba1d6 100644 --- a/csrc/ops/alias.h +++ b/csrc/ops/alias.h @@ -182,4 +182,11 @@ NVF_API TensorView* expand( // non broadcasted iter domain, inp will be expanded to other's size. NVF_API TensorView* expand_as(TensorView* inp, TensorView* other); +// Repeat each dimension for a given time. The repeat_times parameter +// must have the same number of elements as the dimensionality of the +// input tensor (excluding reduction IDs). +NVF_API TensorView* repeat( + TensorView* inp, + const std::vector& repeat_times); + } // namespace nvfuser diff --git a/csrc/ops/composite.cpp b/csrc/ops/composite.cpp index d2f0d9277d2..ed8986ff817 100644 --- a/csrc/ops/composite.cpp +++ b/csrc/ops/composite.cpp @@ -80,9 +80,9 @@ TensorView* triu(TensorView* tv, Val* offset) { NVF_CHECK( dims >= 2, - "triu is only supported for 2+D tensors, but got ", + "input tensor for triu must have 2 or more dims, but got ", dims, - "D tensor"); + " dims"); auto fusion = tv->fusion(); diff --git a/csrc/preseg_passes/insert_reshardings.cpp b/csrc/preseg_passes/insert_reshardings.cpp index 9d62e0dc1a9..c634616d1d6 100644 --- a/csrc/preseg_passes/insert_reshardings.cpp +++ b/csrc/preseg_passes/insert_reshardings.cpp @@ -33,7 +33,8 @@ void insertReshardingsBefore(Fusion* fusion) { // Remove this after we refactor this as a pre-segmenter pass. FusionGuard fg(fusion); for (Expr* expr : fusion->exprs()) { - if (HostIrLower::canLower(expr) || shouldReshardAfter(expr)) { + if (HostIrLower::canLower(expr, /*ignore_inner_resharding=*/true) || + shouldReshardAfter(expr)) { continue; } @@ -85,7 +86,8 @@ void insertReshardingsAfter(Fusion* fusion) { auto exprs = fusion->exprs(); for (auto it = std::rbegin(exprs); it != std::rend(exprs); it++) { Expr* expr = *it; - if (HostIrLower::canLower(expr) || !shouldReshardAfter(expr)) { + if (HostIrLower::canLower(expr, /*ignore_inner_resharding=*/true) || + !shouldReshardAfter(expr)) { continue; } diff --git a/csrc/preseg_passes/pre_segmenter.cpp b/csrc/preseg_passes/pre_segmenter.cpp index b4943f1c91e..76da769835f 100644 --- a/csrc/preseg_passes/pre_segmenter.cpp +++ b/csrc/preseg_passes/pre_segmenter.cpp @@ -58,8 +58,19 @@ namespace nvfuser::preseg_passes { // avoid moving pad operatoins around, which could disturb the analysis // from MarkAliasPrepare // 2. after MoveSplitCat - // to avoid this pass moving PadOp around to break the MoveSplitCat. - OptimizationPass::runPass(fusion); + // to avoid this pass moving PadOp around to break the + // MoveSplitCat. + // + // Moving a pad backward means all preceding operations would be + // executed for the whole padded region too. Since the resize + // scheduler does not have the issue, let it take care of padding + // whenever enabled. Note that even when it is enabled, it is + // currently only limited to pointwise patterns and does not + // support, for example, reductions, etc, so this preseg pass still + // may be preferable in some cases. + if (!isOptionEnabled(EnableOption::ResizeScheduler)) { + OptimizationPass::runPass(fusion); + } // NOTE vvv this doesn't really work, since our type promotion to higher // precision for Add cannot be canceled out with previous cast to lower // precision. 
Since it's not an no-op and it has a quantization effect. I'll diff --git a/csrc/preseg_passes/propagate_shardings.cpp b/csrc/preseg_passes/propagate_shardings.cpp index 3f8e514184e..69ba5983060 100644 --- a/csrc/preseg_passes/propagate_shardings.cpp +++ b/csrc/preseg_passes/propagate_shardings.cpp @@ -98,47 +98,6 @@ void PropagateShardingsPass::runPass(Fusion* fusion) { shardAllLike(ref_input, outputs_without_mesh); } - // shardAllLike, which calls parallelAllLke, tries to DID-parallelize - // reduction dimensions. For example, - // - // [iDID{i1}, i2] -> (Reduce) -> [r{i1}, i2] -> (Pointwise) -> [i2] - // - // becomes - // - // [iDID{i1}, i2] -> (Reduce) -> [rDID{i1}, i2] -> (Pointwise) -> [i2] - // - // This implies that the reduction result only exists on the "home" device. - // `lower_communication` can't lower such a reduction today. lowerToReduce - // is closest but it uses the output device mesh to indicate the home device. - // Also, an extra broadcast will be needed to replicate the reduction result - // to all devices for the pointwise op. - // - // Therefore, instead, we remove the DID from reduction dimensions and - // therefore reset them to Serial. This way, - // the above becomes - // - // [iDID{i1}, i2] -> (Reduce) -> [r{i1}, i2] -> (Pointwise) -> [i2] - // - // where the reduction will be lowered to an Allreduce. - // - // Alternatively, @naoyam proposed to represent an allreduce as a reduce - // followed by a broadcasting set. - // - // [iDID{i1}, i2] -> (Reduce) -> [rDID{i1}, i2] -> (Set) [i2] -> (Pointwise) - // -> [i2] - // - // This will make the semantics similar to other parallel types and therefore - // we can better leverage existing parallelization utilities. We have yet to - // pursue this because of implementation difficulty -- `lower_communication` - // would need to match the reduce-set pattern. - for (TensorView* tv : fusion->allTvs()) { - for (IterDomain* id : tv->getLoopDomain()) { - if (id->isReduction() && id->isDeviceDim()) { - id->parallelize(ParallelType::Serial); - } - } - } - // Back-propagate device meshes. This makes sure all TensorViews have a mesh // if any of them has one. This is needed in addition to the forward // propagation for ops that don't take any TensorView operands, e.g., diff --git a/csrc/preseg_passes/reorder_sharded_axis.cpp b/csrc/preseg_passes/reorder_sharded_axis.cpp index f6359cb424e..bf68f6aa9c5 100644 --- a/csrc/preseg_passes/reorder_sharded_axis.cpp +++ b/csrc/preseg_passes/reorder_sharded_axis.cpp @@ -25,7 +25,7 @@ void ReorderShardedAxisPass::runPass(Fusion* fusion) { const std::vector& exprs = fusion->exprs(); for (auto it = std::rbegin(exprs); it != std::rend(exprs); it++) { Expr* expr = *it; - if (!isResharding(expr)) { + if (HostIrLower::canLower(expr)) { continue; } NVF_ERROR( diff --git a/csrc/preseg_passes/translate_repeat_to_expand.cpp b/csrc/preseg_passes/translate_repeat_to_expand.cpp index 382dcb85f52..19c1274bf83 100644 --- a/csrc/preseg_passes/translate_repeat_to_expand.cpp +++ b/csrc/preseg_passes/translate_repeat_to_expand.cpp @@ -124,13 +124,11 @@ class RepeatToExpandTranslator { } } - // For each detected repetition: - // - // Step 1. Insert a broadcast ID immediately outside of the - // repeated ID - // Step 2. Expand the broadcast ID by the repetition factor - // Step 3. Flatten the expanded ID and the repeated ID + // For each detected repetition, replace the output with a repeat + // output. 
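  // Editor's sketch of the rewrite, assuming the detected pattern is a
  // concatenation of the same input, with info.input_tv = t0 and two cat
  // inputs along `dim`:
  //
  //   before: t1 = cat({t0, t0}, dim)
  //   after : t1 = repeat(t0, {..., 2, ...})   // 2 at `dim`, 1 elsewhere
  //
  // repeat() itself lowers repeated non-broadcast dims to the
  // broadcast+expand+flatten pattern and uses RepeatOp only for repeated
  // broadcast dims (see ops/alias.cpp earlier in this diff).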
void translate() { + FusionGuard fg(fusion_); + const auto exprs = fusion_->exprs(); // Apply the translation in a reverse topological order. Since the // output of the repetition is replaced, the use exprs of the @@ -145,36 +143,22 @@ class RepeatToExpandTranslator { const auto& info = repeat_info_map_it->second; - if (info.cat_inp_tvs.size() < 2) { - continue; - } - - auto original_out_tv = expr->output(0)->as(); + const auto num_repetitions = (int64_t)info.cat_inp_tvs.size(); - // Step 1 - auto inp_domain = + const auto inp_domain = TensorDomain::noReductions(info.input_tv->getLogicalDomain()); - std::vector bcast_flags(inp_domain.size() + 1, false); - auto repeated_id_offset = std::distance( - inp_domain.begin(), - std::find(inp_domain.begin(), inp_domain.end(), info.repeated_id)); - bcast_flags.at(repeated_id_offset) = true; - auto broadcast_tv = broadcast(info.input_tv, bcast_flags); - NVF_ERROR((size_t)broadcast_tv->nDims() == inp_domain.size() + 1); - - // Step 2 - std::vector expanded_sizes( - bcast_flags.size(), IrBuilder::create(-1L)); - expanded_sizes.at(repeated_id_offset) = - IrBuilder::create((int64_t)info.cat_inp_tvs.size()); - auto expanded_tv = expand(broadcast_tv, expanded_sizes); - - // Step 3 - auto flattened_tv = - flatten(expanded_tv, repeated_id_offset, repeated_id_offset + 1); + + std::vector repeated_times(inp_domain.size(), 1); + auto repeated_id_it = + std::find(inp_domain.begin(), inp_domain.end(), info.repeated_id); + NVF_ERROR(repeated_id_it != inp_domain.end()); + auto repeated_dim = std::distance(inp_domain.begin(), repeated_id_it); + repeated_times.at(repeated_dim) = num_repetitions; + + TensorView* replacement_tv = repeat(info.input_tv, repeated_times); ir_utils::replaceValInAllExprInputsAndFusionOutputs( - original_out_tv, flattened_tv); + expr->output(0), replacement_tv); } } diff --git a/csrc/python_frontend/communicator_bindings.cpp b/csrc/python_frontend/communicator_bindings.cpp new file mode 100644 index 00000000000..a2fb632b447 --- /dev/null +++ b/csrc/python_frontend/communicator_bindings.cpp @@ -0,0 +1,50 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#include + +#include + +namespace nvfuser::python_frontend { + +void bindCommunicator(py::module& nvfuser) { + // py::nodelete is necessary because Communicator doesn't have a destructor: + // https://pybind11.readthedocs.io/en/stable/advanced/classes.html#non-public-destructors + py::class_> + communicator(nvfuser, "Communicator"); + communicator.def( + "instance", + &Communicator::getInstance, + "Returns the singleton communicator instance.", + py::return_value_policy::reference); + communicator.def( + "size", + &Communicator::size, + "Returns the number of processes in the communicator."); + communicator.def( + "rank", + &Communicator::deviceId, + "Returns the device ID associated with the current process."); + communicator.def( + "local_size", + &Communicator::local_size, + "Returns the number of processes within the node."); + communicator.def( + "local_rank", + &Communicator::local_rank, + "Returns the in-node rank associated with the current process."); + communicator.def( + "barrier", + [](Communicator& self) { + // Communicator::barrier takes an optional backend argument, which we + // don't use yet. 
+ self.barrier(); + }, + "Performs a blocking barrier across all ranks."); +} + +} // namespace nvfuser::python_frontend diff --git a/csrc/python_frontend/python_bindings.cpp b/csrc/python_frontend/python_bindings.cpp index ea061b094f1..d20105e990a 100644 --- a/csrc/python_frontend/python_bindings.cpp +++ b/csrc/python_frontend/python_bindings.cpp @@ -997,7 +997,13 @@ void initNvFuserPythonBindings(PyObject* module) { return ss.str(); }); tensor_class.def_property_readonly( - "ndim", [](Tensor& self) { return self.dims; }); + "ndim", + [](Tensor& self) { return self.dims; }, + "Returns the rank of the tensor."); + tensor_class.def_property_readonly( + "index", + [](Tensor& self) { return self.index; }, + "Returns the index of the tensor as in FusionDefinition.sched.tensors()."); tensor_class.def("_get_fusion_definition", [](Tensor& self) { return self.fusion_definition; }); @@ -1693,6 +1699,52 @@ void initNvFuserPythonBindings(PyObject* module) { NVFUSER_PYTHON_BINDING_UNARY_OP("imag", imag) #undef NVFUSER_PYTHON_BINDING_UNARY_OP + nvf_ops.def( + "triu", + [](FusionDefinition::Operators& self, + Tensor input, + int64_t diagonal) -> Tensor { + FUSER_PERF_SCOPE("Operators.triu"); + NVF_CHECK( + self.validUse(), "Attempting to add to a completed definition!"); + FusionDefinition* fd = self.fusion_definition; + Tensor output = fd->defineTensor(input.dims); + + auto diagonal_ = fd->defineScalar(); + fd->defineRecord(new ScalarRecord( + {fd->recordingState(diagonal_())}, diagonal, DataType::Int, true)); + + fd->defineRecord(new OpRecord( + {fd->recordingState(input()), fd->recordingState(diagonal_())}, + {fd->recordingState(output())}, + ("ops.triu"), + serde::RecordType::Binary_TV_VAL, + static_cast(triu))); + + return output; + }, + py::arg("input"), + py::arg("diagonal") = 0, + py::return_value_policy::reference, + R"doc( + Returns the upper triangular part of a 2+D tensor. + + Parameters + ---------- + input : Tensor + The input tensor. + diagonal : int, optional + The diagonal to consider. Default is 0. + + Returns + ------- + Tensor + The upper triangular part of the input tensor. 
+ + >>> a = torch.randn(3, 3) + >>> fd.ops.triu(a) + )doc"); + // overload to nvf_ops.def( "stride_order", @@ -3570,6 +3622,8 @@ void initNvFuserPythonBindings(PyObject* module) { py::return_value_policy::reference); bindSchedule(fusion_def); + + bindCommunicator(nvfuser); } void cleanup() { diff --git a/csrc/python_frontend/python_bindings.h b/csrc/python_frontend/python_bindings.h index bd8f0347530..11039eea7ef 100644 --- a/csrc/python_frontend/python_bindings.h +++ b/csrc/python_frontend/python_bindings.h @@ -14,9 +14,13 @@ #include namespace nvfuser::python_frontend { + NVF_API void initNvFuserPythonBindings(PyObject* module); +void bindCommunicator(py::module& nvfuser); + void bindSchedule(py::class_& fusion_def); NVF_API void cleanup(); + } // namespace nvfuser::python_frontend diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 04f86b1edd0..36a0f9fc966 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -461,7 +461,19 @@ void KernelExecutor::compile( } } - kernel_code_ = codegen::generateCudaKernel(kernel, kernelName()); + // TODO: pass block_size here; + std::optional dynamic_smem = std::nullopt; + std::optional block_size = std::nullopt; + if (!args.empty()) { + auto expr_eval = executor_utils::bindInputs(args, kernel); + auto launch_params = computeLaunchParams( + launch_constraints, expr_eval, warp_size_, kernel->indexType()); + block_size = launch_params.nThreads(); + dynamic_smem = launch_params.smem(); + NVF_ERROR(block_size > 0, "launch param inferred block size < 0"); + } + + kernel_code_ = codegen::generateCudaKernel(kernel, kernelName(), block_size); // If NVFUSER_EXTERNAL_SRC is set, utilize the external source code. // If the loaded external source code is empty, revert to the default codegen. @@ -525,18 +537,6 @@ void KernelExecutor::compile( NVF_THROW(ss.str()); } - // TODO: pass block_size here; - std::optional dynamic_smem = std::nullopt; - std::optional block_size = std::nullopt; - if (!args.empty()) { - auto expr_eval = executor_utils::bindInputs(args, kernel); - auto launch_params = computeLaunchParams( - launch_constraints, expr_eval, warp_size_, kernel->indexType()); - block_size = launch_params.nThreads(); - dynamic_smem = launch_params.smem(); - NVF_ERROR(block_size > 0, "launch param inferred block size < 0"); - } - // TODO: high water mark should be computed via occupancy API after // compilation. diff --git a/csrc/scheduler/no_op.h b/csrc/scheduler/no_op.h index 3dbd53e8e0a..6f88f521bd5 100644 --- a/csrc/scheduler/no_op.h +++ b/csrc/scheduler/no_op.h @@ -45,24 +45,4 @@ class NoOpScheduler : public SchedulerEntry { } }; -//! Provides a dummy heuristic type to ensure -//! unified interface on NoOp scheduler. 
-class NoOpHeuristic : public HeuristicParams { - public: - using HeuristicParams::HeuristicParams; - NoOpHeuristic() : HeuristicParams(SchedulerType::NoOp) {}; - - size_t hash() const override { - return 0; - } - std::unique_ptr clone() const override { - return std::make_unique(*this); - } - - bool sameAs(const HeuristicParams* other) const override { - auto other_casted = dynamic_cast(other); - return other_casted != nullptr && other_casted->cparams == cparams; - }; -}; - } // namespace nvfuser diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp index 41277250b16..ef400301487 100644 --- a/csrc/scheduler/resize.cpp +++ b/csrc/scheduler/resize.cpp @@ -15,12 +15,16 @@ #include #include #include +#include #include #include #include #include +#include #include +#include + namespace nvfuser { namespace { @@ -30,6 +34,31 @@ TensorView* getReferenceTensor(Fusion* fusion) { return pointwise_utils::getReferenceTensor(fusion); } +// Returns the largest tensor with its number of elements +std::pair getLargestTensor( + const std::vector& vals, + SchedulerRuntimeInfo& runtime_info) { + int64_t max_num_elms = -1; + TensorView* largest_tv = nullptr; + for (auto tv : ir_utils::filterByType(vals)) { + int64_t num_elms = 1; + for (auto logical_id : tv->getLogicalDomain()) { + auto inferred_val = + runtime_info.expressionEvaluator().evaluate(logical_id->extent()); + NVF_ERROR( + inferred_val.hasValue(), + "Error inferring extent of: ", + logical_id->toString()); + num_elms *= inferred_val.as(); + } + if (num_elms > max_num_elms) { + largest_tv = tv; + max_num_elms = num_elms; + } + } + return std::make_pair(largest_tv, max_num_elms); +} + } // namespace bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) { @@ -111,12 +140,10 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) { } } - // This doesn't work yet due to issue #3571 auto ref_tv = getReferenceTensor(fusion); - if (std::any_of( - ref_tv->getLogicalDomain().begin(), - ref_tv->getLogicalDomain().end(), - [](IterDomain* logical_id) { return logical_id->isBroadcast(); })) { + if (ref_tv == nullptr) { + scheduler_debug_utils::canScheduleRejectReason( + schedulerType(), "No reference found"); return false; } @@ -158,10 +185,12 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) { } } - // Disable the scheduler if there's a squeeze op. The loop option - // may also need to be enabled in that case, but that option is not - // turned on automatically yet. 
- if (ir_utils::hasOpsOfType(fusion)) { + // Skip transpose-like patterns for now + scheduler_tools::TransposeDomainMap domain_map(fusion); + auto grouped_inputs_outputs = domain_map.groupInputsOutputsByInnerDim(); + if (grouped_inputs_outputs.size() >= 2) { + scheduler_debug_utils::canScheduleRejectReason( + schedulerType(), "Transpose-like patterns not supported."); return false; } @@ -173,8 +202,52 @@ std::unique_ptr ResizeScheduler::computeHeuristics( SchedulerRuntimeInfo& runtime_info, HeuristicDataCache* data_cache) { FUSER_PERF_SCOPE("ResizeScheduler::computeHeuristics"); - auto params = std::make_unique(SchedulerType::Resize); + auto params = std::make_unique(SchedulerType::Resize); + params->tag = "Resize heuristics"; params->cparams.index_type = runtime_info.getIndexType(); + + const int64_t bdimx = 128; + + const auto& [largest_output, max_num_elms] = + getLargestTensor(fusion->outputs(), runtime_info); + + params->split_grid_x_dim = + ceilDiv(max_num_elms, bdimx) > ResizeParams::max_gdimx; + + const auto largest_input = + getLargestTensor(fusion->inputs(), runtime_info).first; + if (largest_input != nullptr) { + int64_t index_of_largest_input = std::distance( + fusion->inputs().begin(), + std::find( + fusion->inputs().begin(), fusion->inputs().end(), largest_input)); + params->largest_input = index_of_largest_input; + } else { + params->largest_input = -1; + } + + auto ref_tv_entry = + HeuristicDataCacheEntry( + data_cache, [fusion]() { + std::vector data{getReferenceTensor(fusion)}; + return std::make_unique>(std::move(data)); + }); + TensorView* ref_tv = ref_tv_entry.get()[0]; + + // Before applying the vectorization split, any reshape transform of + // the largest input will be cancelled whenever possible, so the + // largest input is used as the reference of vectorization. + auto vec_ref_tv = largest_input != nullptr ? largest_input : ref_tv; + + // Only consider the innermost dimension to vectorize for now. 
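(A worked example, with hypothetical sizes, of the values this function produces, placed before the getVectorizationFactor call and the return that follow: suppose the largest output holds 2^26 float elements and the largest fusion input is input 0 with a contiguous innermost extent divisible by 4. Then

  ceilDiv(max_num_elms, bdimx) = ceilDiv(2^26, 128) = 2^19 = 524,288
  524,288 <= ResizeParams::max_gdimx = 2^31 - 1, so split_grid_x_dim = false
  largest_input = 0
  vectorization_factor = 4, i.e. 16-byte accesses along that input's innermost logical ID

With bdimx = 128, split_grid_x_dim only becomes true for outputs of roughly 2^38 elements or more.)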
+ // TODO: Consider vectorizing merged IDs, not just the innermost + params->vectorization_factor = vectorize_helper::getVectorizationFactor( + runtime_info, + vec_ref_tv, + data_cache, + (int64_t)vec_ref_tv->getLogicalDomain().size() - 1, + {}); + return params; } @@ -182,21 +255,27 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) { FUSER_PERF_SCOPE("ResizeScheduler::schedule"); FusionGuard fg(fusion); + const auto resize_params = dynamic_cast(params); + NVF_ERROR(resize_params != nullptr); scheduler_utils::clearMemorySpace(fusion); + auto ref_tv = getReferenceTensor(fusion); + NVF_ERROR(ref_tv != nullptr); + scheduler_utils::cacheInputs(fusion, true); scheduler_utils::cacheAndForkOutputs(fusion, true); auto resize_tensor_ops = ir_utils::getOpsOfType(fusion); - IdModel id_model(fusion, /*build_graphs=*/false); - const auto& exact_graph = id_model.buildExactGraph(); + std::unique_ptr id_model = + std::make_unique(fusion, /*build_graphs=*/false); + id_model->buildExactGraph(); // Replicate resize inputs if necessary to avoid conflicting // propagations const auto exclusivity_info_map = scheduler_tools::getNonExclusiveResizeInfo( - resize_tensor_ops, exact_graph); + resize_tensor_ops, id_model->idGraph(IdMappingMode::EXACT)); for (auto resize_tensor_op : resize_tensor_ops) { auto out_tv = resize_tensor_op->output(0)->as(); if (exclusivity_info_map.count(out_tv) == 0) { @@ -213,6 +292,17 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) { ir_utils::replaceValInExprInputs(resize_tensor_op, inp_tv, inp_tv_copy); } + TensorView* largest_input = nullptr; + if (resize_params->largest_input >= 0) { + largest_input = + fusion->inputs().at(resize_params->largest_input)->as(); + + // The tensors are going to be reordered to align with the largest + // input. To make it work, merge operations for reshape should be + // cancelled. + scheduler_tools::cancelReshapeInLoopDomains(largest_input); + } + for (auto expr : fusion->exprs()) { if (!expr->isOneOf()) { continue; @@ -221,31 +311,184 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) { scheduler_tools::propagateResizeToInputs(expr); } - auto ref_tv = getReferenceTensor(fusion); + // Update the IdModel + id_model = std::make_unique(fusion, /*build_graphs=*/false); + id_model->buildExactGraph(); + + // Detect an ending repeat + auto static_repeat_info = scheduler_tools::getMaybeStaticRepeatInfo(ref_tv); // Just simple scheduling for now. // TODO: Do something smarter. Can just use the pointwise scheduler? + // Reorder tensors to align with the largest input. This is expected + // to improve the memory read performance, while the write + // performance could be lowered. This should generally be more + // important to optimize the read performance, but more robust + // decision would be needed. 
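(To make the reordering intent above concrete before the implementation that follows, a hypothetical sketch using scheduler_utils::reorderTensorLike, the helper added to csrc/scheduler/utils.cpp later in this diff; the shapes are made up and the rest of the API (makeContigConcreteTensor, permute, sin) is the existing nvFuser C++ test API:

Fusion fusion;
FusionGuard fg(&fusion);
// Largest input tv0 is allocated as [i0, i1, i2].
auto tv0 = makeContigConcreteTensor({16, 32, 2});
fusion.addInput(tv0);
auto tv1 = permute(tv0, {1, 0, 2});  // loop domain ordered [i1, i0, i2]
auto tv2 = sin(tv1);
fusion.addOutput(tv2);
// Align tv2 with tv0's allocation order so the flatten/split/parallelize
// steps walk tv0 contiguously; loop IDs not present in the reference are
// placed at the outermost positions.
scheduler_utils::reorderTensorLike(
    tv2, {tv0->axis(0), tv0->axis(1), tv0->axis(2)});
// tv2's loop domain is now ordered [i0, i1, i2].

In ResizeScheduler::schedule the reference tensor plays the role of tv2 and ref_alloc is built from the largest input's allocation domain, as in the code that follows.)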
+ if (largest_input != nullptr) { + std::vector ref_alloc; + ref_alloc.reserve(largest_input->getMaybeAllocationDomain().size()); + std::copy_if( + largest_input->getMaybeAllocationDomain().begin(), + largest_input->getMaybeAllocationDomain().end(), + std::back_inserter(ref_alloc), + [](IterDomain* alloc_id) { + return !alloc_id->isBroadcast() && !alloc_id->isReduction() && + !alloc_id->isDeviceDim(); + }); + + // Reorder the reference as the allocation domain of the largest fusion + // input + scheduler_utils::reorderTensorLike(ref_tv, ref_alloc); + } + + const int64_t bdimx = 128; + // Make sure the DID ID located at the outermost position - const auto outermost_pos = scheduler_utils::reorderDevicesToOuter(ref_tv); + auto outermost_pos = scheduler_utils::reorderDevicesToOuter(ref_tv); + + // [DID, ..., ...] + // ^ + // +--- outermost_pos + + // Move the static repeat ID to the outermost position if + // detected. The repeat ID then just remains there with no + // scheduling. + bool repeat_id_moved_to_outermost = false; + if (static_repeat_info.has_value()) { + NVF_ERROR(ref_tv == static_repeat_info->repeat_output_tv); + auto ref_repeat_id_it = std::find_if( + ref_tv->getLoopDomain().begin(), + ref_tv->getLoopDomain().end(), + [&](IterDomain* loop_id) { + return id_model->idGraph(IdMappingMode::EXACT) + .disjointValSets() + .strictAreMapped(loop_id, static_repeat_info->reshape_repeat_id); + }); + // Gives up if the repeat ID is not found. Unclear if this could + // actually happen, though. + if (ref_repeat_id_it != ref_tv->getLoopDomain().end()) { + auto repeat_id_pos = + std::distance(ref_tv->getLoopDomain().begin(), ref_repeat_id_it); + NVF_ERROR( + repeat_id_pos >= outermost_pos, + "Unexpected to have DID-parallelized repeat axis: ", + static_repeat_info->reshape_repeat_id->toString()); + + // [DID, ..., repeat_id, ...] + // ^ + // +--- outermost_pos + ref_tv->reorder(std::unordered_map{{repeat_id_pos, 0}}); + ++outermost_pos; + // [repeat_id, DID, ...] + // ^ + // +--- outermost_pos + + repeat_id_moved_to_outermost = true; + } + } + + const int64_t vec_factor = resize_params->vectorization_factor; - // Schedule only the remaining IDs - ref_tv->flatten(outermost_pos); - ref_tv->split(outermost_pos, 128); - ref_tv->split(outermost_pos, 1 << 14); - ref_tv->axis(-1)->parallelize(ParallelType::TIDx); - ref_tv->axis(-2)->parallelize(ParallelType::BIDx); + int64_t next_innermost_pos = -1; + // [..., ...] + // ^ + // +--- next_innermost_pos + + if (vec_factor > 1) { + ref_tv->split(-1, vec_factor); + --next_innermost_pos; + // [..., vec_factor] + // ^ + // +--- next_innermost_pos + } + + ref_tv->flatten(outermost_pos, next_innermost_pos); + // [..., I0, vec_factor] + // ^ + // +--- next_innermost_pos + + ref_tv->split(next_innermost_pos, bdimx); + ref_tv->axis(next_innermost_pos)->parallelize(ParallelType::TIDx); + --next_innermost_pos; + // [..., I0/bdimx, bdimx(TIDx), vec_factor] + // ^ + // +--- next_innermost_pos + + if (resize_params->split_grid_x_dim) { + ref_tv->split(next_innermost_pos, ResizeParams::max_gdimx); + // [..., I0/bdimx/max_gdimx, max_gdimx, bdimx(TIDx), vec_factor] + } + ref_tv->axis(next_innermost_pos)->parallelize(ParallelType::BIDx); + // [..., I0/bdimx/max_gdimx, max_gdimx(BIDx), bdimx(TIDx), vec_factor] or + // [..., I0/bdimx(BIDx), bdimx(TIDx), vec_factor] // Propagate the reference to the other tensors. Note that the - // update flag is enabled so to workaround the resize propagation + // update flag is enabled to workaround the resize propagation // issue. 
This may not work if there's a tensor that is reshaped // from the reference tensor, but that should not be the case as the // reference is picked by the same routine used for the pointwise // scheduler. - scheduler_tools::scheduleLoopDomainsLike( - fusion->allTvs(), - ref_tv->getLoopDomain(), - /*update_loop_domain_only=*/true); + // + // When an ending static repeat is detected and the repeat ID is + // moved to the outermost position, propagation is done separately + // between the tensors before the repeat and after the repeat. The + // tensors are first grouped into the pre-repeat group and the + // post-repeat group, where only the latter group has the repeat + // IDs. When propagating the loop domain of the reference tensor, + // which has the repeat ID, the full loop domain is propagated only + // to the post-repeat group. For the pre-repeat group, the repeat ID + // is dropped and only the remaining loop domain is propagated. + if (repeat_id_moved_to_outermost) { + // Divide all tvs to the pre and posgt repeat groups + auto all_tvs = fusion->allTvs(); + std::vector post_repeat_tvs; + post_repeat_tvs.reserve(static_repeat_info->repeat_tvs.size()); + std::vector pre_repeat_tvs; + pre_repeat_tvs.reserve( + all_tvs.size() - static_repeat_info->repeat_tvs.size()); + for (auto tv : all_tvs) { + if (static_repeat_info->repeat_tvs.count(tv)) { + post_repeat_tvs.push_back(tv); + } else { + pre_repeat_tvs.push_back(tv); + } + } + + // The repeat ID should be located at the outermost position + std::vector non_repeated_loop{ + ref_tv->getLoopDomain().begin() + 1, ref_tv->getLoopDomain().end()}; + + scheduler_tools::scheduleLoopDomainsLike( + pre_repeat_tvs, + non_repeated_loop, + /*update_loop_domain_only=*/true); + scheduler_tools::scheduleLoopDomainsLike( + post_repeat_tvs, + ref_tv->getLoopDomain(), + /*update_loop_domain_only=*/true); + } else { + scheduler_tools::scheduleLoopDomainsLike( + fusion->allTvs(), + ref_tv->getLoopDomain(), + /*update_loop_domain_only=*/true); + } + + if (vec_factor > 1) { + auto vec_ref_tv = largest_input != nullptr ? largest_input : ref_tv; + const auto tvs_to_vectorize = + scheduler_utils::getInputsOutputsWithInnerDim(vec_ref_tv, true, true); + for (auto tv_to_vectorize : tvs_to_vectorize) { + if (tv_to_vectorize->isFusionInput()) { + for (auto consumer_tv : ir_utils::consumerTvsOf(tv_to_vectorize)) { + consumer_tv->axis(-1)->parallelize(ParallelType::Vectorize); + } + } else { + tv_to_vectorize->axis(-1)->parallelize(ParallelType::Vectorize); + } + } + } inlineMost(); diff --git a/csrc/scheduler/resize_heuristic.h b/csrc/scheduler/resize_heuristic.h new file mode 100644 index 00000000000..14ba0d2af72 --- /dev/null +++ b/csrc/scheduler/resize_heuristic.h @@ -0,0 +1,67 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include +#include +#include +#include + +#include + +namespace nvfuser { + +class ResizeParams : public HeuristicParams { + public: + ResizeParams() : HeuristicParams(SchedulerType::Resize) {}; + + // Split grid x dimension + bool split_grid_x_dim = false; + + int64_t largest_input = -1; + + int64_t vectorization_factor = 1; + + static constexpr int64_t max_gdimx = (1L << 31) - 1L; + + using HeuristicParams::HeuristicParams; + + // Warning: Does not check launch parameters! 
+ bool sameAs(const HeuristicParams* other_base) const override { + auto other = dynamic_cast(other_base); + if (other == nullptr) { + return false; + } + bool attr_equal = other->cparams == cparams && + other->split_grid_x_dim == split_grid_x_dim && + other->largest_input == largest_input && + other->vectorization_factor == vectorization_factor; + return attr_equal; + } + + std::string toString() const override { + std::stringstream ss; + ss << "\n===== Resize Parameters ========\n" + << (tag.empty() ? "" : "Tag: ") << tag << " Resize Characteristics:\n" + << " split grid x dim: " << split_grid_x_dim << "\n" + << " index of largest input: " << largest_input << "\n" + << " vectorization factor: " << vectorization_factor << "\n"; + ss << "====================================\n"; + return ss.str(); + } + + size_t hash() const override { + return c10::get_hash(split_grid_x_dim); + } + + std::unique_ptr clone() const override { + return std::make_unique(*this); + } +}; + +} // namespace nvfuser diff --git a/csrc/scheduler/tools/loop_domain_scheduler.cpp b/csrc/scheduler/tools/loop_domain_scheduler.cpp index fd7f2a01240..daa7c92eacc 100644 --- a/csrc/scheduler/tools/loop_domain_scheduler.cpp +++ b/csrc/scheduler/tools/loop_domain_scheduler.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -407,7 +408,8 @@ void scheduleLoopDomainsLike( void scheduleLoopDomainsBy( const std::vector& tvs, - Expr* transform) { + Expr* transform, + Direction replay_dir) { Fusion* fusion = transform->fusion(); IdModel id_model(fusion, /*build_graphs=*/false); const ValGraph& exact_graph = id_model.buildExactGraph(); @@ -439,17 +441,19 @@ void scheduleLoopDomainsBy( } } - Direction replay_dir = Direction::Undefined; - // It should be either: all of the inputs found and none of the // outputs found, or none of the inputs found and all of the // outputs found. - if (input_ids.size() == transform->inputs().size()) { + Direction replay_dir_tv = Direction::Undefined; + if (replay_dir != Direction::Backward && + input_ids.size() == transform->inputs().size()) { NVF_ERROR(output_ids.empty()); - replay_dir = Direction::Forward; - } else if (output_ids.size() == transform->outputs().size()) { + replay_dir_tv = Direction::Forward; + } else if ( + replay_dir != Direction::Forward && + output_ids.size() == transform->outputs().size()) { NVF_ERROR(input_ids.empty()); - replay_dir = Direction::Backward; + replay_dir_tv = Direction::Backward; } else { // Replay not possible since none of inputs nor outputs are connected with // the transform @@ -457,11 +461,12 @@ void scheduleLoopDomainsBy( } const auto& existing_ids = - replay_dir == Direction::Forward ? input_ids : output_ids; + replay_dir_tv == Direction::Forward ? input_ids : output_ids; // Clone inputs or outputs - auto& new_ids = replay_dir == Direction::Forward ? output_ids : input_ids; - const auto& ref_of_ids_to_generate = replay_dir == Direction::Forward + auto& new_ids = + replay_dir_tv == Direction::Forward ? output_ids : input_ids; + const auto& ref_of_ids_to_generate = replay_dir_tv == Direction::Forward ? 
transform->outputs() : transform->inputs(); @@ -500,5 +505,162 @@ void scheduleLoopDomainsBy( return; } +void cancelReshapeInLoopDomains(TensorView* from_tv) { + Fusion* fusion = from_tv->fusion(); + IdModel id_model(fusion, /*build_graphs=*/false); + id_model.buildExactGraph(); + const auto& exact_graph = id_model.idGraph(IdMappingMode::EXACT); + + // Reshapes producing these IDs should not be cancelled + ValGroups reshape_dependent_ids; + for (const ExprGroup& expr_g : + exact_graph.disjointExprSets().disjointSets()) { + if (expr_g->front()->isA()) { + reshape_dependent_ids.pushBack(exact_graph.inputGroups(expr_g)); + } + } + + for (const ValGroup& val_g : exact_graph.disjointValSets().disjointSets()) { + if (std::any_of(val_g->begin(), val_g->end(), [](Val* val) { + NVF_ERROR(val->isA()); + return val->as()->isReduction(); + })) { + reshape_dependent_ids.pushBack(val_g); + } + } + + auto all_dep_exprs_from_tv = + DependencyCheck::getAllExprsBetween({from_tv}, fusion->outputs()); + + // Visit all reshapes in a reverse topological order + for (auto exprs_it = all_dep_exprs_from_tv.rbegin(); + exprs_it != all_dep_exprs_from_tv.rend(); + ++exprs_it) { + auto reshape = dynamic_cast(*exprs_it); + if (reshape == nullptr) { + continue; + } + + auto reshape_out = reshape->out(); + + auto all_dep_vals = + DependencyCheck::getAllValsBetween({reshape_out}, fusion->outputs()); + // Exclude reshape_out. These tensors are going to be updated by + // replaying the reshape transform exprs using + // scheduleLoopDomainsBy. Since the reshape output + // tensor already has the exprs, replaying with + // scheduleLoopDomainsBy would complain if not excluded. For the + // reshape output tensor, setLoopDomain is done with the existing + // IDs without replaying. + all_dep_vals.erase(all_dep_vals.begin()); + auto all_dep_tvs = ir_utils::filterByType(all_dep_vals); + + // Find logical IDs that do not exist in the root domain. They are + // the new IDs that are produced by this reshape op. If a logical + // ID is already found in the root domain, there's nothing to do + // for it. + std::vector new_logical_ids; + for (const auto& logical_id : reshape_out->getLogicalDomain()) { + if (!reshape_out->domain()->isRoot(logical_id)) { + new_logical_ids.push_back(logical_id); + } + } + + if (new_logical_ids.empty()) { + // Nothing to do with a no-op reshape. This may not happen. + continue; + } + + // Find logical IDs that do not need to exist in the loop domain + std::unordered_set cancellable_ids; + for (const auto new_logical_id : new_logical_ids) { + auto new_id_group = exact_graph.toGroup(new_logical_id); + // Not cancellable if used by resize or reduced. + auto reachable_exprs = getReachableNodesFrom( + {new_id_group}, + {reshape_dependent_ids.begin(), reshape_dependent_ids.end()}, + Direction::Forward, + exact_graph); + if (!reachable_exprs.empty()) { + continue; + } + + cancellable_ids.insert(new_logical_id); + } + + if (cancellable_ids.empty()) { + continue; + } + + // Update the loop domain by each of the reshape exprs in a + // reverse topological order. 
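(A hypothetical walk-through of the loop below for a single merge-only reshape, to make the bookkeeping concrete; the iter-domain names are made up:

  reshape_out logical: [i1, i0*i2], produced from [i1, i0, i2] by one Merge(i0, i2)
  initial loop domain of reshape_out: [i1, i0*i2]

The Merge output i0*i2 is in cancellable_ids, so scheduleLoopDomainsBy replays the Merge backward on every dependent TensorView, and for reshape_out itself the loop-domain vector is edited in place:

  insert the Merge inputs before i0*i2:  [i1, i0*i2] -> [i1, i0, i2, i0*i2]
  erase the Merge outputs:               [i1, i0, i2, i0*i2] -> [i1, i0, i2]

setLoopDomain([i1, i0, i2]) at the end then leaves reshape_out iterating in the pre-reshape order, matching the example in loop_domain_scheduler.h.)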
+ auto reshape_exprs = DependencyCheck::getAllExprsBetween( + {reshape_out->getRootDomain().begin(), + reshape_out->getRootDomain().end()}, + {reshape_out->getLogicalDomain().begin(), + reshape_out->getLogicalDomain().end()}); + + auto reshape_out_loop_domain = reshape_out->getLoopDomain(); + + for (auto reshape_exprs_it = reshape_exprs.rbegin(); + reshape_exprs_it != reshape_exprs.rend(); + ++reshape_exprs_it) { + auto reshape_expr = *reshape_exprs_it; + + // If any of the output IDs of reshape_expr is not found in + // cancellable_ids, that means the expr cannot be cancelled. + if (std::any_of( + reshape_expr->outputs().begin(), + reshape_expr->outputs().end(), + [&](Val* reshape_expr_out) -> bool { + return !cancellable_ids.count(reshape_expr_out); + })) { + continue; + } + + // Update all of the dependent TVs by this reshape expr + scheduleLoopDomainsBy( + all_dep_tvs.vector(), reshape_expr, Direction::Backward); + + cancellable_ids.insert( + reshape_expr->inputs().begin(), reshape_expr->inputs().end()); + + // For the reshape output tensor itself, since it already has the + // reshape expr, it just needs + // tv->setLoopDomain(tv->getRootDomain()). However, since some of the + // reshape exprs may not be cancellable, update a vector of the + // loop IDs for each of the cancelled exprs individually and use + // it to set the loop domain of the reshape output tensor + + // Insert the input IDs to the loop domain + auto insert_pos = std::find( + reshape_out_loop_domain.begin(), + reshape_out_loop_domain.end(), + reshape_expr->outputs().front()); + NVF_ERROR(insert_pos != reshape_out_loop_domain.end()); + for (auto inp : reshape_expr->inputs()) { + insert_pos = + reshape_out_loop_domain.insert(insert_pos, inp->as()); + ++insert_pos; + } + + // Remove the output IDs + reshape_out_loop_domain.erase( + std::remove_if( + reshape_out_loop_domain.begin(), + reshape_out_loop_domain.end(), + [&](IterDomain* cur_loop_id) { + return std::find( + reshape_expr->outputs().begin(), + reshape_expr->outputs().end(), + cur_loop_id) != reshape_expr->outputs().end(); + }), + reshape_out_loop_domain.end()); + } + + reshape_out->setLoopDomain(reshape_out_loop_domain); + } +} + } // namespace scheduler_tools } // namespace nvfuser diff --git a/csrc/scheduler/tools/loop_domain_scheduler.h b/csrc/scheduler/tools/loop_domain_scheduler.h index 5939c9d31e2..fa0d4e0d2ae 100644 --- a/csrc/scheduler/tools/loop_domain_scheduler.h +++ b/csrc/scheduler/tools/loop_domain_scheduler.h @@ -7,13 +7,17 @@ // clang-format on #pragma once +#include + #include namespace nvfuser { class Expr; +class Fusion; class TensorView; class IterDomain; +class ViewOp; namespace scheduler_tools { @@ -30,14 +34,14 @@ void scheduleLoopDomainsLike( bool update_loop_domain_only = false); // Replay a transform expr on the loop domain of each of the given -// tensors. If the input of the transform is exact mapped with the loop -// domain, the transform is replayed as a forward op. If the output -// is exact mapped with the loop domain, it's replayed as a backward -// op. The loop domain of each tensor is updated with the replayed -// transform expr. If it's replayed as a forward op, the outputs -// replace the inputs in the loop domain. If it's replayed as a -// backward op, the inputs replace the outputs in the loop domain. The -// new IDs are inserted at the outermost position of the input IDs. +// tensors. If the replay direction is specified, the expr is replayed +// as specified. 
Otherwise, if the input of the transform is exact mapped with +// the loop domain, the transform is replayed as a forward op. If the output is +// exact mapped with the loop domain, it's replayed as a backward op. The loop +// domain of each tensor is updated with the replayed transform expr. If it's +// replayed as a forward op, the outputs replace the inputs in the loop domain. +// If it's replayed as a backward op, the inputs replace the outputs in the loop +// domain. The new IDs are inserted at the outermost position of the input IDs. // // For example, suppose a fusion has: // @@ -62,7 +66,48 @@ void scheduleLoopDomainsLike( // LoopDomainSchedulingTest.ScheduleLoopDomainsBy1 for more examples. void scheduleLoopDomainsBy( const std::vector& tvs, - Expr* transform); + Expr* transform, + Direction replay_dir = Direction::Undefined); + +// For each of immediate and indirect consumer tensors of from_tv, +// schedule its loop domain such that reshape transforms appearing +// between the tensor and from_tv are cancelled. For example, suppose +// a fusion has: +// +// t0 = makeSymbolicTensor(3); // [i0, i1, i2] +// t1 = permute(t0, {1, 0, 2}); // [i1, i0, i2] +// t2 = reshape(t1, {i1, i0*i2}); // [i1, i0*i2] +// t3 = sin(t2) // [i1, i0*i2] +// +// In this case, cancelReshapeInLoopDomains(t0) would affect t2 and t3 +// as follows: +// +// t2: +// root: [i1, i0*i2] (unchanged) +// logical: [i1, i0*i2] (unchanged) +// loop: [i1, i0, i2] +// +// t3: +// logical: [i1, i0*i2] (unchanged) +// loop: [i1, i0, i2] +// +// t1 would not be changed at all as there's no reshape between t0 and +// t1. +// +// This scheduling could help optimize memory accesses to +// fusion inputs. In the above case, we could then reorder the loop +// domains of t1, t2 and t3 as [i0, i1, i2], i.e., the same ordering +// as t0, which could minimize strided accesses. +// +// This scheduling is not always feasible. Specifically, if a reshape +// output iter domain is resized, the loop domain needs to keep using +// the reshape output iter domain. Similarly, if a rehape output iter +// domain is reduced, the reshape is currently not cancelled. This is +// because if a reshape has a split and only one of the split output +// iter domain is reduced, the split needs to remain. If a reshape +// only consists of merge transforms, cancellation should be possible, +// but that is not currently supported. +void cancelReshapeInLoopDomains(TensorView* from_tv); } // namespace scheduler_tools } // namespace nvfuser diff --git a/csrc/scheduler/tools/static_repeat.cpp b/csrc/scheduler/tools/static_repeat.cpp new file mode 100644 index 00000000000..d438cf292b6 --- /dev/null +++ b/csrc/scheduler/tools/static_repeat.cpp @@ -0,0 +1,166 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on + +#include +#include +#include + +namespace nvfuser { +namespace scheduler_tools { + +std::optional getMaybeStaticRepeatInfo( + TensorView* maybe_repeat_out) { + // The pattern to detect: + // + // broadcast_out = broadcast(input) + // expand_out = expand(broadcast_out) + // repeat_out = reshape(expand_out) + // + // Additionally, since maybe_repeat_out is commonly a fusion + // output, it is likely there's a cache tv between expand_out and + // repeat_out, so the following pattern should also be detected. 
+ // + // broadcast_out = broadcast(input) + // expand_out = expand(broadcast_out) + // cache_of_repeat_out = reshape(expand_out) + // repeat_out = set(cache_of_repeat_out) + + std::unordered_set repeat_tvs; + repeat_tvs.insert(maybe_repeat_out); + + auto reshape_out = maybe_repeat_out; + + // Check if there's a cache + if (auto ldst = dynamic_cast(maybe_repeat_out->definition()); + ldst->opType() == LoadStoreOpType::Set) { + reshape_out = ldst->in()->as(); + repeat_tvs.insert(reshape_out); + } + + // Detect reshape + auto reshape = dynamic_cast(reshape_out->definition()); + if (reshape == nullptr) { + return std::nullopt; + } + + // Detect expand + auto expand_out = reshape->in(); + repeat_tvs.insert(expand_out); + auto expand = dynamic_cast(expand_out->definition()); + if (expand == nullptr) { + return std::nullopt; + } + + // Detect broadcast + auto broadcast_out = expand->in(); + repeat_tvs.insert(broadcast_out); + auto broadcast = dynamic_cast(broadcast_out->definition()); + if (broadcast == nullptr) { + return std::nullopt; + } + + auto inp_tv = broadcast->in(); + + // Not sure if this is really necessary to check, but assume there's + // only single chain of the ops and tensors from inp_tv to + // maybe_reshape_out + if (inp_tv->uses().size() > 1 && + std::any_of(repeat_tvs.begin(), repeat_tvs.end(), [](TensorView* tv) { + return tv->uses().size() > 1; + })) { + return std::nullopt; + } + + // Check if the ops match with the repeat pattern. Currently only + // one iter domain can be repeated + IterDomain* broadcast_id = nullptr; + int64_t broadcast_pos = -1; + for (const auto i : c10::irange(broadcast_out->getLogicalDomain().size())) { + if (broadcast->getBroadcastDimFlags().at(i)) { + if (broadcast_id != nullptr) { + // Multiple broadcast IDs not supported + return std::nullopt; + } + broadcast_id = broadcast_out->getLogicalDomain().at(i); + broadcast_pos = (int64_t)i; + } + } + + if (broadcast_id == nullptr) { + return std::nullopt; + } + + // Check if and only if the broadcast ID is expanded + IterDomain* expanded_id = nullptr; + for (const auto i : c10::irange(broadcast_out->getLogicalDomain().size())) { + auto p_id = broadcast_out->getLogicalDomain().at(i); + auto c_id = expand_out->getLogicalDomain().at(i); + if (p_id == broadcast_id && c_id->isBroadcast() && + c_id->hasExpandedExtent()) { + expanded_id = c_id; + } else if ( + p_id->isBroadcast() && !p_id->hasExpandedExtent() && + c_id->isBroadcast() && c_id->hasExpandedExtent()) { + // Expanded but this broadcast was not introduced by the + // preceding broadcast op + return std::nullopt; + } + } + + if (expanded_id == nullptr) { + return std::nullopt; + } + + // Only a static repeat factor is considered + if (!expanded_id->expandedExtent()->isConstInt()) { + return std::nullopt; + } + + // The expanded ID should be merged with the iter domain next to it, + // and that should be the only reshape expr + auto reshape_exprs = DependencyCheck::getAllExprsBetween( + {reshape_out->getRootDomain().begin(), + reshape_out->getRootDomain().end()}, + {reshape_out->getLogicalDomain().begin(), + reshape_out->getLogicalDomain().end()}); + if (reshape_exprs.size() != 1) { + return std::nullopt; + } + + auto reshape_merge = dynamic_cast(reshape_exprs.at(0)); + if (reshape_merge == nullptr) { + return std::nullopt; + } + + // The corresponding root ID of the outout tv should be one of the + // inputs of the merge + auto reshape_root_broadcast = reshape_out->getRootDomain().at(broadcast_pos); + if (reshape_merge->outer() != 
reshape_root_broadcast && + reshape_merge->inner() != reshape_root_broadcast) { + return std::nullopt; + } + + // Reshape of an expanded broadcast always generates a concrete + // non-broadcast ID, so this check is not necessary, but just in + // case in the future that may change. + if (reshape_merge->out()->isBroadcast() || + reshape_merge->out()->hasExpandedExtent()) { + return std::nullopt; + } + + StaticRepeatInfo info; + info.repeat_output_tv = maybe_repeat_out; + info.reshape_output_tv = reshape_out; + info.reshape_repeat_id = reshape_out->getRootDomain().at(broadcast_pos); + info.repeat_tvs = repeat_tvs; + + return info; +} + +} // namespace scheduler_tools +} // namespace nvfuser diff --git a/csrc/scheduler/tools/static_repeat.h b/csrc/scheduler/tools/static_repeat.h new file mode 100644 index 00000000000..bfdd8d4f346 --- /dev/null +++ b/csrc/scheduler/tools/static_repeat.h @@ -0,0 +1,80 @@ +// clang-format off +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ +// clang-format on +#pragma once + +#include +#include + +namespace nvfuser { + +class IterDomain; +class TensorView; + +namespace scheduler_tools { + +// torch.repeat can be represented as: +// +// t0: [i0, i1] +// t1 = broadcast(t0) // [i0, b2, i1] +// t2 = expand(t1, {-1, 2, -1}); // [i0, b2(2), i1] +// t3 = reshape(t2, {i0, 2 * i1}); // [i0, 2*i1] +// +// It is especially important to recognize this pattern when it +// appears at the end of a pointwise fusion segment, where an output +// is used as the reference tensor of scheduling the segment. For +// example, if a segment has the above pattern at the end of +// the segment with t3 as the only output, the whole segment may be +// scheduled based on t3. That is quite common in RoPE, where Q, K and +// V tensors have different sizes but smaller tensors are commonly +// repeated at the end of its computation. +// +// This can be problematic since the whole segment is scheduled based +// on the repeated tensor whose size is largere than the rest of the +// tensors by the repetition factor. For example, if the it is +// repeated twice, we would launch threads and blocks that are +// required for the twice-larger tensor but most of the actual +// computations will actually only need half of them. In fact, +// depending on actual scheduling strategies, they may be just +// redundantly doing the same computations, which should be avoided if +// possible. +// +// getMaybeStaticRepeatInfo analyzes a given tensor and its producers +// to detect the above repeat pattern. The detected pattern is +// currently only used by the resize scheduler. It effectively factors +// out the repetition factor as an iter domain and moves it to the +// outermost position. The remaining iter domains are scheduled and +// propagated to the rest of the tensors. +// +// TODO: Consider generalizing this heuristics to the other +// schedulers. + +struct StaticRepeatInfo { + // The final output tensor of the detected repeat pattern, e.g., + // t3 in the above example case. + TensorView* repeat_output_tv = nullptr; + // The reshape output tensor, e.g., t3 in the above example case. It + // is not the same as repeat_output_tv when there's a cache. + TensorView* reshape_output_tv = nullptr; + // The ID of reshape output TV that corresponds to the + // expanded broadcast ID. 
In the above example case, this + // would be the root ID of t3 that corresponds to b2 + IterDomain* reshape_repeat_id = nullptr; + // Output tensors of the detected broadcast, expand and reshape + // ops. In the above example case, this would consist of t1, t2 and t3. + std::unordered_set repeat_tvs; +}; + +// Check if the given tensor matches with the final reshape output +// tensor of the repetition pattern and return the relevant +// information about the detected pattern. Only a static repeat case +// is considered. +std::optional getMaybeStaticRepeatInfo( + TensorView* maybe_repeat_out_tv); + +} // namespace scheduler_tools +} // namespace nvfuser diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp index cd22d935a52..4c4ea51fa5c 100644 --- a/csrc/scheduler/utils.cpp +++ b/csrc/scheduler/utils.cpp @@ -2666,6 +2666,79 @@ int64_t reorderDevicesToOuter(TensorView* tv) { return (int64_t)old2new.size(); } +void reorderTensorLike( + TensorView* target_tv, + const std::vector& ref) { + const auto& tv_loop_domain = target_tv->getLoopDomain(); + + IdModel id_model(target_tv->fusion(), /*build_graphs=*/false); + const auto& graph = id_model.buildBroadcastGraph(); + + ValGroups target_groups = graph.toGroups(tv_loop_domain); + + ValGroups ref_groups = graph.toGroups(ref); + + // Traverse from the reference to the target tv. The reference is + // not guaranteed to cover all loop IDs of target, so + // require_all_to_visited needs to be false + auto path = ValGraphBFS::getExprGroupsBetween( + graph, + ref_groups, + target_groups, + /*require_all_to_visited=*/false) + .first; + + // Traverse the expr path to create an ordered ID groups + std::deque ordered_domain{ + ref_groups.vector().begin(), ref_groups.vector().end()}; + + for (const auto& [expr_g, dir] : path) { + auto inputs = getInputsOfExpr( + expr_g, dir, ValGraphInputs(graph), ValGraphOutputs(graph)); + auto outputs = getOutputsOfExpr( + expr_g, dir, ValGraphInputs(graph), ValGraphOutputs(graph)); + + // Inserts the outputs at the innermost position + auto innermost_it = + std::find(ordered_domain.begin(), ordered_domain.end(), inputs.back()); + NVF_ERROR(innermost_it != ordered_domain.end()); + ordered_domain.insert(innermost_it, outputs.begin(), outputs.end()); + + // Removes the inputs + for (const auto& inp : inputs) { + ordered_domain.erase( + std::remove(ordered_domain.begin(), ordered_domain.end(), inp), + ordered_domain.end()); + } + } + + std::unordered_map old2new; + + // Place IDs that do not appear in ref at the outer position + int64_t new_id_pos = 0; + for (const auto i : c10::irange(tv_loop_domain.size())) { + const auto& loop_id_group = graph.toGroup(tv_loop_domain.at(i)); + auto it = + std::find(ordered_domain.begin(), ordered_domain.end(), loop_id_group); + if (it == ordered_domain.end()) { + old2new.emplace((int64_t)i, new_id_pos); + ++new_id_pos; + } + } + for (const auto i : c10::irange(tv_loop_domain.size())) { + const auto& loop_id_group = graph.toGroup(tv_loop_domain.at(i)); + auto it = + std::find(ordered_domain.begin(), ordered_domain.end(), loop_id_group); + if (it != ordered_domain.end()) { + int64_t new_pos = + (int64_t)std::distance(ordered_domain.begin(), it) + new_id_pos; + old2new.emplace((int64_t)i, new_pos); + } + } + + target_tv->reorder(old2new); +} + } // namespace scheduler_utils } // namespace nvfuser diff --git a/csrc/scheduler/utils.h b/csrc/scheduler/utils.h index 62a359816d2..96c929d74fe 100644 --- a/csrc/scheduler/utils.h +++ b/csrc/scheduler/utils.h @@ -745,5 +745,9 @@ inline 
int64_t nLogicalDims(const TensorView* tv) { return tv_n_dims; } +// Reorer the loop domain of a given tensor to align with a given list of +// reference IDs. Non-matching loop IDs are placed outermost positions. +void reorderTensorLike(TensorView* tv, const std::vector& ref); + } // namespace scheduler_utils } // namespace nvfuser diff --git a/csrc/serde/fusion_record.cpp b/csrc/serde/fusion_record.cpp index 7e23adf2b69..253270f0b9d 100644 --- a/csrc/serde/fusion_record.cpp +++ b/csrc/serde/fusion_record.cpp @@ -653,6 +653,11 @@ void RecordFunctorFactory::setupFunctionMaps() { ("ops." op_str), static_cast(op_name)); \ unary_val.emplace(("ops." op_str), static_cast(op_name)); +#define NVFUSER_UNARY_TV_ALPHA_OP(op_str, op_name) \ + binary_tv_val.emplace( \ + ("ops." op_str), \ + static_cast(op_name)); + #define NVFUSER_BINARY_TV_ONLY_OP(op_str, op_name) \ binary_tv.emplace( \ ("ops." op_str), \ @@ -808,6 +813,8 @@ void RecordFunctorFactory::setupFunctionMaps() { NVFUSER_UNARY_TV_OP("real", real) NVFUSER_UNARY_TV_OP("imag", imag) + NVFUSER_UNARY_TV_ALPHA_OP("triu", triu) + NVFUSER_BINARY_TV_ONLY_OP("matmul", matmul) NVFUSER_BINARY_TV_ONLY_OP("linear", linear) NVFUSER_TERNARY_TV_ONLY_OP("linear", linear) diff --git a/csrc/type.cpp b/csrc/type.cpp index ab087361a1d..36819404ae9 100644 --- a/csrc/type.cpp +++ b/csrc/type.cpp @@ -712,6 +712,8 @@ static const char* parallel_type2string(ParallelType t) { return "threadIdx.y"; case ParallelType::TIDx: return "threadIdx.x"; + case ParallelType::Stream: + return "Stream"; case ParallelType::Vectorize: return "V"; case ParallelType::MisalignedVectorize: diff --git a/csrc/type.h b/csrc/type.h index 89cebe8763b..265f1a939ee 100644 --- a/csrc/type.h +++ b/csrc/type.h @@ -672,6 +672,7 @@ enum class ParallelType { TIDz, TIDy, TIDx, + Stream, Vectorize, MisalignedVectorize, Unroll, diff --git a/tests/cpp/test_circular_buffering.cpp b/tests/cpp/test_circular_buffering.cpp index a5eb595af9c..d6372c8835c 100644 --- a/tests/cpp/test_circular_buffering.cpp +++ b/tests/cpp/test_circular_buffering.cpp @@ -12,9 +12,154 @@ #include #include #include +#include namespace nvfuser { +TEST_F(NVFuserTest, RegisterSharingCircularBufferingPointwiseCustom) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + std::unique_ptr fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + int64_t number_of_stages = 4; + int64_t prefetch_distance = 1; + int64_t tensor_outer_dim = 128; + int64_t tensor_inner_dim = 128; + CircularBufferType circular_buffer_type = + WarpSpecialized(ParallelType::TIDy, std::make_pair(160L, 160L)); + + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = makeContigTensor(2); + fusion->addInput(tv0); + fusion->addInput(tv1); + + TensorView* tv2 = add(tv0, tv1); + fusion->addOutput(tv2); + + // Use TMA to load TV0 into shared memory + TensorView* tv3 = tv0->cacheAfter(LoadStoreOpType::CpAsyncBulkTensorTile); + tv3->setMemoryType(MemoryType::Shared); + + TensorView* tv4 = tv1->cacheAfter(LoadStoreOpType::CpAsyncBulkTensorTile); + tv4->setMemoryType(MemoryType::Shared); + + TensorView* reference = tv2; + + // Constants + constexpr int64_t bulk_inner_dim = 32; + + // [M, N] -> [M, N/bid, bid] + reference->split(-1, bulk_inner_dim); + + TransformPropagatorWithCheck propagator(reference); + MaxLogicalDomainInfoSpanningTree(reference).traverse(&propagator); + + // Set computeAt position + inlineAllAt(tv2, /*pos=*/2); + + // Circular Buffer with TMA loads + tv3->axis(0)->parallelize(ParallelType::BIDx); + 
tv3->axis(2)->parallelize(ParallelType::Bulk); + tv3->circularBuffer( + number_of_stages, prefetch_distance, circular_buffer_type); + + // Load TV1 into shared memory + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(2)->parallelize(ParallelType::Bulk); + tv4->circularBuffer( + number_of_stages, prefetch_distance, circular_buffer_type); + + // Split reference to parallelize TMA tile + reference->split(-1, 32); + reference->axis(0)->parallelize(ParallelType::BIDx); + reference->axis(-1)->parallelize(ParallelType::TIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); + at::Tensor t1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); + at::Tensor t2 = t0 + t1; + + KernelExecutor ke; + ke.compile(fusion.get(), {t0, t1}); + + std::vector cg_outputs = ke.run({t0, t1}); + testValidate(fusion.get(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, RegisterSharingCircularBufferingPointwiseNested) { + NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(9, 0, 10, 0); + std::unique_ptr fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + int64_t number_of_stages = 4; + int64_t prefetch_distance = 1; + int64_t tensor_outer_dim = 128; + int64_t tensor_inner_dim = 128; + CircularBufferType circular_buffer_type = + WarpSpecialized(ParallelType::TIDy, std::make_pair(160L, 160L)); + + TensorView* tv0 = makeContigTensor(2); + TensorView* tv1 = makeContigTensor(2); + fusion->addInput(tv0); + fusion->addInput(tv1); + + TensorView* tv2 = add(tv0, tv1); + fusion->addOutput(tv2); + + // Use TMA to load TV0 into shared memory + TensorView* tv3 = tv0->cacheAfter(LoadStoreOpType::CpAsyncBulkTensorTile); + tv3->setMemoryType(MemoryType::Shared); + + TensorView* tv4 = tv1->cacheAfter(LoadStoreOpType::CpAsyncBulkTensorTile); + tv4->setMemoryType(MemoryType::Shared); + + TensorView* reference = tv2; + + // Constants + constexpr int64_t bulk_inner_dim = 32; + + // [M, N] -> [M, N/bid, bid] + reference->split(-1, bulk_inner_dim); + + TransformPropagatorWithCheck propagator(reference); + MaxLogicalDomainInfoSpanningTree(reference).traverse(&propagator); + + // Set computeAt position + inlineAllAt(tv2, /*pos=*/2); + + // Circular Buffer with TMA loads + // tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(2)->parallelize(ParallelType::Bulk); + tv3->circularBuffer( + number_of_stages, prefetch_distance, circular_buffer_type); + + // Load TV1 into shared memory + // tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(2)->parallelize(ParallelType::Bulk); + tv4->circularBuffer( + number_of_stages, prefetch_distance, circular_buffer_type); + + // Split reference to parallelize TMA tile + reference->split(-1, 32); + // reference->axis(0)->parallelize(ParallelType::BIDx); + reference->axis(-1)->parallelize(ParallelType::TIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); + at::Tensor t1 = at::randn({tensor_outer_dim, tensor_inner_dim}, options); + at::Tensor t2 = t0 + t1; + + KernelExecutor ke; + try { + ke.compile(fusion.get(), {t0, t1}); + } catch (const std::exception& e) { + const char* reference = + R"(When using register sharing with warp-specialized circular buffering, the circular buffer loop must be the outer-most for-loop.)"; + const char* str_match_pointer = strstr(e.what(), reference); + ASSERT_TRUE(str_match_pointer != nullptr); + } +} + using 
StageAndPrefetch = std::pair; class CircularBufferingTest : public NVFuserFixtureParamTest { @@ -855,6 +1000,12 @@ class TmaCircularBufferingTest NVFuserTest::SetUp(); } + bool testEnablesRegisterSharing() { + return std::holds_alternative(circular_buffer_type) && + std::get(circular_buffer_type) + .num_registers.has_value(); + } + template void compare(int64_t tensor_dim, at::Tensor result, at::Tensor reference) { at::Tensor reference_cpu_data = reference.cpu(); @@ -992,6 +1143,10 @@ TEST_F(NVFuserTest, ElectSyncCompatibility) { TEST_P(TmaCircularBufferingTest, SingleDim) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1042,6 +1197,10 @@ TEST_P(TmaCircularBufferingTest, SingleDim) { TEST_P(TmaCircularBufferingTest, SingleDimUnroll) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1103,6 +1262,10 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnroll) { TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1164,6 +1327,10 @@ TEST_P(TmaCircularBufferingTest, SingleDimUnswitch) { TEST_P(TmaCircularBufferingTest, MultiDim) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1228,6 +1395,10 @@ TEST_P(TmaCircularBufferingTest, MultiDim) { TEST_P(TmaCircularBufferingTest, Pointwise) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1300,6 +1471,10 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) { << "Needs shared memory predicate, but current needsSharedMemoryPredicate() returns false"; NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1365,6 +1540,10 @@ TEST_P(TmaCircularBufferingTest, PointwiseCpAsync) { TEST_P(TmaCircularBufferingTest, InnerReduction) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1426,6 +1605,10 @@ TEST_P(TmaCircularBufferingTest, InnerReduction) { TEST_P(TmaCircularBufferingTest, OuterReduction) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1479,6 +1662,10 @@ TEST_P(TmaCircularBufferingTest, OuterReduction) { 
TEST_P(TmaCircularBufferingTest, Persistent) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } constexpr at::ScalarType dtype = at::ScalarType::Float; constexpr int64_t correction = 0; @@ -1612,6 +1799,10 @@ TEST_P(TmaCircularBufferingTest, Persistent) { TEST_P(TmaCircularBufferingTest, Matmul) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1733,6 +1924,11 @@ TEST_P(TmaCircularBufferingTest, Matmul) { TEST_P(TmaCircularBufferingTest, MatmulWithBroadcastedInput) { NVFUSER_TEST_CUDA_ARCH_GUARD(9, 0); + if (testEnablesRegisterSharing() && deviceMajorMinorCheck(10)) { + GTEST_SKIP() << "Register Sharing is only for hopper"; + return; + } + std::unique_ptr fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -1855,7 +2051,9 @@ auto tmaCircularBufferingParams() { Pipelined(false), Pipelined(true), WarpSpecialized(ParallelType::TIDx), - WarpSpecialized(ParallelType::TIDy)}; + WarpSpecialized(ParallelType::TIDy), + WarpSpecialized(ParallelType::TIDx, std::make_pair(40, 240)), + WarpSpecialized(ParallelType::TIDy, std::make_pair(40, 240))}; std::vector values; for (int64_t i : {2, 4}) { for (int64_t j : c10::irange(-i, i)) { diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index 76d45f6de4c..c7eba2b1fed 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -9272,6 +9272,118 @@ TEST_F(NVFuserTest, AllIdsMultipleDependencies) { } } +// Repeating a broadcast ID. RepeatOp should be used. +TEST_F(NVFuserTest, RepeatBroadcast) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + auto tv0 = makeConcreteTensor({10}); + fusion.addInput(tv0); + + auto tv1 = broadcast(tv0, {false, true}); + auto tv2 = repeat(tv1, {1L, 2L}); + fusion.addOutput(tv2); + + EXPECT_TRUE(tv2->definition()->isA()); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn({10}, options); + std::vector inputs({t0}); + + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(inputs); + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + +// Repeating a non-broadcast ID. Should be translated to broadcast + +// expand + reshape. 
+TEST_F(NVFuserTest, RepeatNonBroadcast) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + auto tv0 = makeConcreteTensor({10}); + fusion.addInput(tv0); + + auto tv1 = repeat(tv0, {2L}); + fusion.addOutput(tv1); + + ASSERT_TRUE(tv1->definition()->isA()); + ASSERT_TRUE(tv1->definition()->input(0)->definition()->isA()); + ASSERT_TRUE(tv1->definition() + ->input(0) + ->definition() + ->input(0) + ->definition() + ->isA()); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn({10}, options); + std::vector inputs({t0}); + + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(inputs); + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + +// Repeating a mix of broadcast and non-broadcast IDs +TEST_F(NVFuserTest, RepeatBroadcastAndNonBroadcast) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + std::vector shape{2, 1, 3, 1}; + auto tv0 = makeConcreteTensor(shape); + fusion.addInput(tv0); + + auto tv1 = repeat(tv0, {2L, 2L, 2L, 2L}); + fusion.addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector inputs({t0}); + + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(inputs); + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, CastPrecision) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2, DataType::Half); + fusion.addInput(tv0); + + auto tv1 = castOp(DataType::Float, tv0); + auto tv2 = castOp(DataType::BFloat16, tv1); + fusion.addOutput(tv2); + + auto tv3 = makeSymbolicTensor(2, DataType::Index); + fusion.addInput(tv3); + + auto tv4 = castOp(DataType::Int, tv3); + fusion.addOutput(tv4); + + auto tv1_precision = ir_utils::getPrecisionOfProducerConsumerTensors( + tv1->definition()->as()); + ASSERT_TRUE(tv1_precision.has_value()); + EXPECT_EQ(tv1_precision->first, 2); + EXPECT_EQ(tv1_precision->second, 4); + + auto tv2_precision = ir_utils::getPrecisionOfProducerConsumerTensors( + tv2->definition()->as()); + ASSERT_TRUE(tv2_precision.has_value()); + EXPECT_EQ(tv2_precision->first, 4); + EXPECT_EQ(tv2_precision->second, 2); + + // Precision of type Index is not possible to determine until lowering + auto tv4_precision = ir_utils::getPrecisionOfProducerConsumerTensors( + tv4->definition()->as()); + ASSERT_FALSE(tv4_precision.has_value()); +} + // Test file size should be up to 10K LoC. Create a new file for more tests. } // namespace nvfuser diff --git a/tests/cpp/test_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp new file mode 100644 index 00000000000..c98893c8623 --- /dev/null +++ b/tests/cpp/test_host_ir_integration.cpp @@ -0,0 +1,62 @@ +// clang-format off +/* +* SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES. +* All rights reserved. 
+* SPDX-License-Identifier: BSD-3-Clause +*/ +// clang-format on +#include +#include +#include +#include +#include +#include + +namespace nvfuser { + +namespace hir { + +using HostIrIntegrationTest = NVFuserTest; + +TEST_F(HostIrIntegrationTest, LaunchKernel) { + Fusion fusion; + FusionGuard fg(&fusion); + TensorView* in = makeSymbolicTensor(2); + fusion.addInput(in); + + TensorView* out = set(in); + fusion.addOutput(out); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({32, 32}, options); + std::vector aten_inputs = {t0}; + auto ke = std::make_unique(); + ke->compile(&fusion, aten_inputs); + + auto hic = std::make_unique(); + FusionGuard::setCurFusion(hic.get()); + + hic->pushBackKernelExecutor(std::move(ke)); + + IrCloner ir_cloner(hic.get()); + auto hic_in = ir_cloner.clone(in); + auto hic_out = ir_cloner.clone(out); + + hic->addInput(hic_in); + hic->addOutput(hic_out); + + auto launch_kernel = IrBuilder::create( + 0, std::vector{hic_in}, std::vector{hic_out}); + + hic->pushBackTopLevelExprs(launch_kernel); + + HostIrEvaluator hie(std::move(hic)); + + auto outputs = hie.runWithInput({{hic_in, t0}}); + + EXPECT_TRUE(outputs[0].equal(t0)); +} + +} // namespace hir + +} // namespace nvfuser diff --git a/tests/cpp/test_id_model.cpp b/tests/cpp/test_id_model.cpp index 4acead00286..246e677db35 100644 --- a/tests/cpp/test_id_model.cpp +++ b/tests/cpp/test_id_model.cpp @@ -135,7 +135,7 @@ class IdModelTester : public LoopPromotionMapBuilderCallback { /*loop_promotion_map_builder_callback=*/this); // Only build the loop graph - id_model->buildLoopGraph(); + id_model->buildLoopGraph(/*force_full_loop_promotion_analysis=*/true); } void postStep1( diff --git a/tests/cpp/test_indexing.cpp b/tests/cpp/test_indexing.cpp index 3691babd5b0..641957407d5 100644 --- a/tests/cpp/test_indexing.cpp +++ b/tests/cpp/test_indexing.cpp @@ -5629,4 +5629,41 @@ TEST_F(IndexingTest, AlmostExactIndexingUpdate) { testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } +// Small repro of +// https://github.com/NVIDIA/Fuser/issues/3688. Broadcast logical +// IDs may not be reachable from loop IDs, thus the indexing for the +// logical IDs of the pad output failed. 
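As a plain-ATen restatement of what the repro below computes (an illustrative sketch only, assuming the ATen headers already used by these tests; not one of the patch hunks):

    // t0: [1, 32] with a broadcast dim 0; t1: [8, 34]
    at::Tensor t0 = at::randn({1, 32});
    at::Tensor t1 = at::randn({8, 34});
    // pad by one element on each side of the innermost dimension
    at::Tensor t2 = at::constant_pad_nd(t0, /*pad=*/{1, 1}); // [1, 34]
    // the add broadcasts t2 over dim 0; that broadcast logical ID of the
    // pad output is the one the indexing has to reach
    at::Tensor t3 = t2 + t1;                                  // [8, 34]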
+TEST_F(IndexingTest, BroadcastLogicalDomainIndexing) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(fusion_ptr.get()); + + std::vector shape1{1, 32}; + std::vector shape2{8, 34}; + + auto tv0 = makeConcreteTensor(shape1); + fusion.addInput(tv0); + auto tv1 = makeConcreteTensor(shape2); + fusion.addInput(tv1); + + auto tv2 = pad(tv0, {fusion.oneVal(), fusion.oneVal()}); + auto tv3 = add(tv2, tv1); + fusion.addOutput(tv3); + + tv2->inlineAt(-1); + + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(1)->parallelize(ParallelType::TIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape1, options); + auto t1 = at::randn(shape2, options); + std::vector inputs{t0, t1}; + + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + } // namespace nvfuser diff --git a/tests/cpp/test_loop_domain_scheduling.cpp b/tests/cpp/test_loop_domain_scheduling.cpp index 32492821a89..0ba0efc5283 100644 --- a/tests/cpp/test_loop_domain_scheduling.cpp +++ b/tests/cpp/test_loop_domain_scheduling.cpp @@ -542,4 +542,270 @@ TEST_F(LoopDomainSchedulingTest, BroadcastRefereceIDs) { } } +// Cancelling a reshape to make all tensors ordered as the input +TEST_F(LoopDomainSchedulingTest, CancelReshape1) { + Fusion fusion; + FusionGuard fg(&fusion); + + std::vector shape{16, 32, 2}; + + auto tv0 = makeContigConcreteTensor(shape); // [i0, i1, i2] + fusion.addInput(tv0); + auto tv1 = permute(tv0, {1, 0, 2}); // [i1, i0, i2] + auto tv2 = + reshape(tv1, shape, {shape[1], shape[0] * shape[2]}); // [i1, i0*i2] + auto tv3 = sin(tv2); + fusion.addOutput(tv3); + + // Cancel the reshape of tv2 + scheduler_tools::cancelReshapeInLoopDomains(tv0); + + // The loop domain of tv2 should now be the same as its root domain. 
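// In other words, tv2 iterates over the pre-reshape [i1, i0, i2] IDs again
// instead of the reshaped [i1, i0*i2] pair, which the next check verifies.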
+ EXPECT_EQ(tv2->getRootDomain(), tv2->getLoopDomain()); + // The loop domain of tv3 should be exact mapped with the tv2 loop + // domain + { + IdModel id_model(&fusion, /*build_graphs=*/false); + const auto& exact_graph = id_model.buildExactGraph(); + EXPECT_EQ( + exact_graph.toGroups(tv3->getLoopDomain()), + exact_graph.toGroups(tv2->getLoopDomain())); + } + + // Reorder tv3 as the input + tv3->reorder({1, 0, 2}); + tv3->flatten(); + tv3->split(0, 128); + scheduler_tools::scheduleLoopDomainsLike({tv1, tv2}, tv3->getLoopDomain()); + + // All loop domains should be exact mapped + { + IdModel id_model(&fusion, /*build_graphs=*/false); + const auto& exact_graph = id_model.buildExactGraph(); + const auto ref_loop = exact_graph.toGroups(tv3->getLoopDomain()); + for (auto tv : {tv1, tv2}) { + EXPECT_EQ(exact_graph.toGroups(tv->getLoopDomain()), ref_loop); + } + } + + inlineMost(); + + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(1)->parallelize(ParallelType::TIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector inputs({t0}); + + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + +// Cancelling chained reshape ops +TEST_F(LoopDomainSchedulingTest, CancelReshape2) { + Fusion fusion; + FusionGuard fg(&fusion); + + std::vector shape{10, 11, 12}; + + auto tv0 = makeContigConcreteTensor(shape); // [i0, i1, i2] + fusion.addInput(tv0); + auto tv1 = reshape( + tv0, + {IrBuilder::create(shape[1]), + IrBuilder::create(shape[0] * shape[2])}); + auto tv2 = reshape( + tv1, + {IrBuilder::create(shape[1]), + IrBuilder::create(shape[2]), + IrBuilder::create(shape[0])}); + auto tv3 = reshape( + tv2, + {IrBuilder::create(shape[0] * shape[1]), + IrBuilder::create(shape[2])}); + fusion.addOutput(tv3); + + // Cancel all reshape ops + scheduler_tools::cancelReshapeInLoopDomains(tv0); + + // All of the tensors should have the same loop domain + { + IdModel id_model(&fusion, /*build_graphs=*/false); + const auto& exact_graph = id_model.buildExactGraph(); + const auto ref_loop = exact_graph.toGroups(tv0->getLoopDomain()); + for (auto tv : {tv1, tv2, tv3}) { + EXPECT_EQ(exact_graph.toGroups(tv->getLoopDomain()), ref_loop); + } + } + + tv3->flatten(); + tv3->split(0, 32); + + inlineMost(); + + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(1)->parallelize(ParallelType::TIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector inputs({t0}); + + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + +// Two reshapes that get merged by a binary op +TEST_F(LoopDomainSchedulingTest, CancelReshape3) { + Fusion fusion; + FusionGuard fg(&fusion); + + std::vector shape{10, 11}; + + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + auto tv1 = reshape(tv0, {IrBuilder::create(-1L)}); + auto tv2 = reshape(tv0, {IrBuilder::create(-1L)}); + auto tv3 = add(tv1, tv2); + fusion.addOutput(tv3); + + // The cancellation of the second reshape won't do anything as the + // loop domain is already updated by the first reshape. 
+ scheduler_tools::cancelReshapeInLoopDomains(tv0); + + // All of the tensors should have the same loop domain + { + IdModel id_model(&fusion, /*build_graphs=*/false); + const auto& exact_graph = id_model.buildExactGraph(); + const auto ref_loop = exact_graph.toGroups(tv0->getLoopDomain()); + for (auto tv : {tv1, tv2, tv3}) { + EXPECT_EQ(exact_graph.toGroups(tv->getLoopDomain()), ref_loop); + } + } + + inlineMost(); + + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(1)->parallelize(ParallelType::TIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape, options); + std::vector inputs({t0}); + + KernelExecutor ke; + ke.compile(&fusion, inputs); + auto outputs = ke.run(inputs); + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); +} + +// Resize should prevent cancellation +TEST_F(LoopDomainSchedulingTest, CancelReshape4) { + Fusion fusion; + FusionGuard fg(&fusion); + + std::vector shape{10, 11, 12}; + + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + // Non-cancellable reshape due to the following slice + auto tv1 = reshape( + tv0, {IrBuilder::create(shape[0]), IrBuilder::create(-1L)}); + auto tv2 = slice( + tv1, + {{fusion.zeroVal(), tv1->axis(0)->extent()}, + {fusion.oneVal(), tv1->axis(1)->extent()}}); + fusion.addOutput(tv2); + + // Cancellable reshape + auto tv3 = reshape( + tv0, + {IrBuilder::create(shape[0] * shape[1]), + IrBuilder::create(-1L)}); + auto tv4 = slice( + tv3, + {{fusion.zeroVal(), tv3->axis(0)->extent()}, + {fusion.oneVal(), tv3->axis(1)->extent()}}); + fusion.addOutput(tv4); + + const auto tv1_original_loop = tv1->getLoopDomain(); + const auto tv2_original_loop = tv2->getLoopDomain(); + + // tv1 and tv2 should not be modified as the slice depends on the reshaped + // domain + scheduler_tools::cancelReshapeInLoopDomains(tv0); + + EXPECT_EQ(tv1->getLoopDomain(), tv1_original_loop); + EXPECT_EQ(tv2->getLoopDomain(), tv2_original_loop); + + // The tv3 reshape should be cancelled as the slice does not + // depend on the reshape expr + { + IdModel id_model(&fusion, /*build_graphs=*/false); + const auto& exact_graph = id_model.buildExactGraph(); + ValGroups ref_loop; + for (const auto i : c10::irange(2)) { + ref_loop.pushBack(exact_graph.toGroup(tv0->getLoopDomain().at(i))); + } + // The first two loop IDs should be exact mapped with tv0 + for (auto tv : {tv3, tv4}) { + ASSERT_EQ(tv->getLoopDomain().size(), 3); + ValGroups tv_loop_groups; + for (const auto i : c10::irange(2)) { + tv_loop_groups.pushBack(exact_graph.toGroup(tv->getLoopDomain().at(i))); + } + EXPECT_EQ(tv_loop_groups, ref_loop); + } + } +} + +// Reduction should prevent cancellation +TEST_F(LoopDomainSchedulingTest, CancelReshape5) { + Fusion fusion; + FusionGuard fg(&fusion); + + std::vector shape{10, 11, 12}; + + auto tv0 = makeContigConcreteTensor(shape); + fusion.addInput(tv0); + + // Non-cancellable reshape due to the following reduction + auto tv1 = reshape( + tv0, {IrBuilder::create(shape[0]), IrBuilder::create(-1L)}); + auto tv2 = sum(tv1, {1}); + fusion.addOutput(tv2); + + // Cancellable reshape + auto tv3 = reshape( + tv0, + {IrBuilder::create(shape[0] * shape[1]), + IrBuilder::create(-1L)}); + auto tv4 = sum(tv3, {1}); + fusion.addOutput(tv4); + + const auto tv1_original_loop = tv1->getLoopDomain(); + const auto tv2_original_loop = tv2->getLoopDomain(); + + // tv1 and tv2 should not be modified as the tv2 reduction depends on the + // reshaped domain + 
scheduler_tools::cancelReshapeInLoopDomains(tv0); + + EXPECT_EQ(tv1->getLoopDomain(), tv1_original_loop); + EXPECT_EQ(tv2->getLoopDomain(), tv2_original_loop); + + // The tv3 reshape should be cancelled as the reduction does not + // depend on the reshape expr + { + IdModel id_model(&fusion, /*build_graphs=*/false); + const auto& exact_graph = id_model.buildExactGraph(); + const auto ref_loop = exact_graph.toGroups(tv0->getLoopDomain()); + for (auto tv : {tv3, tv4}) { + EXPECT_EQ(exact_graph.toGroups(tv->getLoopDomain()), ref_loop); + } + } +} + } // namespace nvfuser diff --git a/tests/cpp/test_matmul.cpp b/tests/cpp/test_matmul.cpp index cd4aa467460..38d4704b954 100644 --- a/tests/cpp/test_matmul.cpp +++ b/tests/cpp/test_matmul.cpp @@ -4371,6 +4371,90 @@ TEST_F(HopperMatmulTest, MLPBenchmarkFwdEpilogueFusion) { EXPECT_TRUE(cg_outputs[1].allclose(tv11_ref, 1e-2, 1e-2)); } +TEST_F(HopperMatmulTest, MLPBenchmarkFwdHorizontalFusion) { + EnableOptionsGuard eog; + EnableOptionsGuard::getCurOptions().set(EnableOption::FuseMultipleMatmuls); + + Fusion fusion; + FusionGuard fg(&fusion); + + constexpr int64_t M = 4096, N = 14336, K = 5120; + const auto dtype = DataType::BFloat16; + + auto tv0 = makeContigConcreteTensor({-1, -1}, dtype); // M, K + auto tv1 = makeContigConcreteTensor({-1, -1}, dtype); // N, K + auto tv2 = makeContigConcreteTensor({-1, -1}, dtype); // N, K + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addInput(tv2); + + auto tv3 = linear(tv0, tv1); + fusion.addOutput(tv3); + + auto tv4 = castOp(DataType::Float, tv3); + auto tv5 = neg(tv4); + auto tv6 = exp(tv5); + auto tv7 = add(fusion.oneVal(DataType::Float), tv6); + auto tv8 = reciprocal(tv7); + auto tv9 = mul(tv4, tv8); + + auto tv10 = linear(tv0, tv2); + fusion.addOutput(tv10); + + auto tv11 = mul(tv9, tv10); + auto tv12 = castOp(DataType::BFloat16, tv11); + fusion.addOutput(tv12); + + auto options = at::TensorOptions().dtype(at::kBFloat16).device(at::kCUDA); + auto a_ref = at::randn({M, K}, options); + auto b_ref = at::randn({N, K}, options); + auto c_ref = at::randn({N, K}, options); + + auto tv3_ref = at::linear(a_ref, b_ref); + auto tv4_ref = tv3_ref.to(at::kFloat); + auto tv10_ref = at::linear(a_ref, c_ref); + auto tv12_ref = + (tv4_ref * (1. 
/ (1.0 + at::exp(-tv4_ref))) * tv10_ref.to(at::kFloat)) + .to(at::kBFloat16); + + MatmulParams mparams; + mparams.supported_vec_size = {8, 8, 8}; + mparams.mma_macro = MmaMacro::Hopper_64_64_16; + MatMulTileOptions gemm_tile; + gemm_tile.cta_tile = GemmTile(128, 128, 16); + gemm_tile.warp_tile = GemmTile(64, 64, 16); + mparams.tile_sizes = gemm_tile; + mparams.cta_order = MatmulParams::TileRasterizationOrder::ColumnMajor; + mparams.async_gmem_load_operands = true; + mparams.circular_buffer_options.circular_buffer_smem_write = true; + mparams.circular_buffer_options.circular_buffer_smem_read = true; + mparams.circular_buffer_options.smem_circular_buffer_stage = 2; + mparams.circular_buffer_options.smem_circular_buffer_prefetch_gap = 1; + mparams.splitk_factor = 1; + mparams.use_smem_epilogue = true; + mparams.cluster_dims = {2, 1, 1}; + mparams.promote_prologue_smem_reuse = true; + + SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul) + ->schedule(&fusion, &mparams); + + std::vector inputs = {a_ref, b_ref, c_ref}; + + KernelExecutor ke; + ke.compile(&fusion, inputs); + EXPECT_TRUE(getBankConflictInfo(ke.kernel()).empty()); + auto cg_outputs = ke.run(inputs); + ASSERT_FALSE( + PredicatedChecker::isCpAsyncMmaPredicatedByIfThenElse(ke.kernel())); + + // Relax tolerance for larger sum due to large K + // TODO: Some of these are failing, perhaps due to improper syncing of + // horizontally fused kernels? + // EXPECT_TRUE(cg_outputs[0].allclose(tv3_ref, 1e-6 * K, 1e-6 * K)); + EXPECT_TRUE(cg_outputs[1].allclose(tv10_ref, 1e-6 * K, 1e-6 * K)); + // EXPECT_TRUE(cg_outputs[2].allclose(tv12_ref, 1e-2, 1e-1)); +} + // This tests that we can use a small instruction tile with a medium size // warpgroup tile and a large CTA tile. TEST_F(HopperMatmulTest, HSH_NT_UseScheduler_MultipleInstructionsPerWarpTile) { diff --git a/tests/cpp/test_multidevice_host_ir.cpp b/tests/cpp/test_multidevice_host_ir.cpp index 27e9477aede..ef05c4a45ac 100644 --- a/tests/cpp/test_multidevice_host_ir.cpp +++ b/tests/cpp/test_multidevice_host_ir.cpp @@ -5,6 +5,7 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on +#include #include #include #include @@ -349,6 +350,64 @@ TEST_F(P2PCommHostIrTest, CoalescedRingPairwiseExchange) { EXPECT_TRUE(torch::allclose(ref_output, outputs.back())); } +using OverlapDistributedMatmulTest = MultiDeviceTest; + +TEST_F(OverlapDistributedMatmulTest, AG_matmul) { + constexpr int64_t M = 32768; + constexpr int64_t K = 32768; + constexpr int64_t N = 1024; + constexpr int64_t S = 8; + const int64_t D = communicator_->size(); + if (M % (D * S) != 0) { + GTEST_SKIP() << "M must be a multiple of D * S, but got M = " << M + << ", D = " << D << ", S = " << S; + } + + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + TensorView* a = makeContigTensor(4); //[S, DIDx(D), M/(S*D), K] + TensorView* b = makeContigTensor(2); //[K, N] + TensorView* c = matmul(a, b); //[S, D, M/(S*D), N] + + fusion->addInput(a); + fusion->addInput(b); + fusion->addOutput(c); + + auto mesh = DeviceMesh::createForNumDevices(D); + a->setDeviceMesh(mesh); + b->setDeviceMesh(mesh); + c->setDeviceMesh(mesh); + + a->axis(1)->parallelize(ParallelType::DIDx); + c->axis(0)->parallelize(ParallelType::Stream); + + MultiDeviceExecutor executor(std::move(fusion), *communicator_); + + auto tensor_options = + at::TensorOptions().dtype(at::kFloat).device(communicator_->device()); + at::Tensor ta_unsharded = at::randn({S, D, M / (S * D), K}, tensor_options); + at::Tensor ta = ta_unsharded.slice( + 1, 
communicator_->deviceId(), communicator_->deviceId() + 1); + at::Tensor tb = at::randn({K, N}, tensor_options); + at::Tensor tc_ref = at::matmul(ta_unsharded, tb); + + std::vector inputs = {ta, tb}; + at::Tensor tc; + + constexpr int64_t kNumberOfIterations = 20; + constexpr int64_t kNumberOfWarmupIterations = 5; + for (auto i : c10::irange(kNumberOfIterations)) { + if (i == kNumberOfWarmupIterations) { + cudaProfilerStart(); + } + tc = executor.runWithInput(inputs).at(0); + } + cudaProfilerStop(); + + EXPECT_TRUE(torch::allclose(tc_ref, tc, 1e-2, 1e-2)); +} + } // namespace hir } // namespace nvfuser diff --git a/tests/cpp/test_multidevice_matmul.cpp b/tests/cpp/test_multidevice_matmul.cpp index a4e323a553d..cbc920bb01f 100644 --- a/tests/cpp/test_multidevice_matmul.cpp +++ b/tests/cpp/test_multidevice_matmul.cpp @@ -415,8 +415,11 @@ TEST_F(DistributedMatmulTest, AnnotateWeightOnly) { // x is of shape [2, 3] and replicated. // w is of shape [3, D*5] and column-wise sharded. // y is expected to have shape [2, D*5] and to be also column-wise sharded. - auto x_tensor = at::randn({2, 3}, tensor_options); - auto w_tensor = at::randn({mesh.size(), 3, 5}, tensor_options); + constexpr int64_t kLowerBound = 0; + constexpr int64_t kUpperBound = 10; + auto x_tensor = at::randint(kLowerBound, kUpperBound, {2, 3}, tensor_options); + auto w_tensor = at::randint( + kLowerBound, kUpperBound, {mesh.size(), 3, 5}, tensor_options); auto sharded_w_tensor = shardTensor(w_tensor, w); FusionExecutorCache executor_cache(std::move(fusion)); diff --git a/tests/cpp/test_preseg_passes.cpp b/tests/cpp/test_preseg_passes.cpp index 4661d6e5599..f3462108496 100644 --- a/tests/cpp/test_preseg_passes.cpp +++ b/tests/cpp/test_preseg_passes.cpp @@ -26,6 +26,8 @@ namespace nvfuser::preseg_passes { +using testing::ElementsAre; + using PresegTest = NVFuserTest; TEST_F(PresegTest, FusionTestOptimizationPassFlag) { @@ -982,4 +984,51 @@ TEST_F(PresegTest, TranslateRepeatToExpand5) { EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::PointWise); } +// Repeating a broadcast ID. Repro of +// https://github.com/NVIDIA/Fuser/issues/3682. 
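The repro below reaches the repeat through cat(). The equivalence it exercises, written in plain ATen for intuition (illustrative only, not one of the patch hunks): concatenating a tensor with itself along a size-1 dimension is the same as repeating it, and for a broadcast axis it is really just an expand.

    at::Tensor t0 = at::randn({32, 1});
    at::Tensor via_cat = at::cat({t0, t0}, /*dim=*/-1); // [32, 2]
    at::Tensor via_repeat = t0.repeat({1, 2});          // [32, 2]
    // no data needs to be duplicated along a size-1 axis
    at::Tensor via_expand = t0.expand({32, 2});         // [32, 2]
    TORCH_INTERNAL_ASSERT(
        via_cat.equal(via_repeat) && via_cat.equal(via_expand));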
+TEST_F(PresegTest, TranslateRepeatToExpand6) { + auto fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr; + FusionGuard fg(&fusion); + + auto tv0 = makeContigConcreteTensor({32, 1}); + fusion.addInput(tv0); + + auto tv1 = cat({tv0, tv0}, -1); + fusion.addOutput(tv1); + + { + // Make sure pad and cat no longer exist + Fusion fusion_copy = fusion; + OptimizationPass::runPass(&fusion_copy); + auto new_exprs = fusion_copy.exprs(); + EXPECT_EQ( + std::find_if( + new_exprs.begin(), + new_exprs.end(), + [](Expr* new_expr) { return new_expr->isOneOf(); }), + new_exprs.end()); + // RepeatOp should be used + EXPECT_NE( + std::find_if( + new_exprs.begin(), + new_exprs.end(), + [](Expr* new_expr) { return new_expr->isA(); }), + new_exprs.end()); + } + + auto options = at::TensorOptions().device(at::kCUDA, 0); + auto t0 = at::randn({32, 1}, options); + std::vector inputs = {t0}; + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(inputs); + testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__); + + // Should be scheduled as a pointwise kernel + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); + EXPECT_THAT( + runtime->fusionSegments()->groups(), + ElementsAre(HeuristicIs(SchedulerType::PointWise))); +} + } // namespace nvfuser::preseg_passes diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index 587f72143a4..970c4066043 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -4991,7 +4991,13 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) { Fusion& fusion = *fusion_ptr; FusionGuard fg(fusion_ptr.get()); - std::vector shape({-1, 100}); + // Due to #3640, the vectorization analysis may return 4 for this + // fusion since there's the use of the input without + // slicing. However, the correct factor needs to consider the + // slicing paths as well. For now, in order to avoid the error due + // to issue #3640, use a size that is divisible by 8. + // std::vector shape({16, 100}); + std::vector shape({16, 96}); EnableOptionsGuard enable_options_guard; EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"}); @@ -5021,7 +5027,7 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) { fusion.addOutput(tv6); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto t0 = at::randn({16, 100}, options); + auto t0 = at::randn(shape, options); std::vector inputs({t0}); const bool use_scheduler = GetParam(); diff --git a/tests/cpp/test_rope.cpp b/tests/cpp/test_rope.cpp index ddec92d58e9..46ca54cc53b 100644 --- a/tests/cpp/test_rope.cpp +++ b/tests/cpp/test_rope.cpp @@ -894,4 +894,81 @@ TEST_P(LitgptRopeTest, Fwd) { testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); } +// Testing the scheduling of an ending repeat pattern, which is +// commonly seen in RoPE. 
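For context on why an ending repeat is common in RoPE, here is the usual shape pattern in plain ATen (illustrative only, with hypothetical sizes; not one of the patch hunks). The per-position frequencies are built once and then concatenated with themselves along the feature dimension, so the last op of that subgraph is effectively a repeat by a factor of two:

    at::Tensor positions = at::arange(128, at::kFloat).unsqueeze(1); // [128, 1]
    at::Tensor inv_freq = at::rand({32});                            // [32]
    at::Tensor freqs = positions * inv_freq;                         // [128, 32]
    // the "ending repeat": tile the frequencies along the last dim
    at::Tensor emb = at::cat({freqs, freqs}, /*dim=*/-1);            // [128, 64]
    at::Tensor cos_table = emb.cos();
    at::Tensor sin_table = emb.sin();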
+TEST_F(RopeTest, EndingRepeat) { + auto fusion_ptr = std::make_unique(); + FusionGuard fg(fusion_ptr.get()); + Fusion& fusion = *fusion_ptr; + + std::vector shape1{8, 126}; + + auto tv0 = makeContigConcreteTensor(shape1); + fusion.addInput(tv0); + + auto tv1 = pad(tv0, {fusion.oneVal(), fusion.oneVal()}); + auto tv2 = repeat(tv1, {2, 1}); + fusion.addOutput(tv2); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto t0 = at::randn(shape1, options); + std::vector inputs({t0}); + + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto outputs = executor_cache.runFusionWithInputs(inputs); + testValidate(&fusion, outputs, inputs, __LINE__, __FILE__); + + FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime(); + EXPECT_FALSE(runtime->isSegmented()); + const auto& heuristic_param = + runtime->schedulerHeuristics()->heuristicsList().front(); + EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); + Fusion* scheduled_fusion = + dynamic_cast(runtime->executors().at(0).get())->fusion(); + + // Check the loop domain of the reference. It should look like: + // + // T4_g_float[iS19{2 ex 2}, iblockIdx.x22{8}, ithreadIdx.x23{128}] ca_pos( 3 ) + // produce_pos( 3 ) + // logical domain : (iS17{( 2 * 8 )}, iS18{128}) + // contiguity: t t + // Merge: iS20{8} and iS18{128} -> iS21{1024} + // Split: iS21{1024} by factor 128 -> iblockIdx.x22{8}, ithreadIdx.x23{128} + // loop domain : (iS19{2 ex 2}, iblockIdx.x22{8}, ithreadIdx.x23{128}) + // + // iS19 is the repeat ID, which should be just a Serial ID with an + // extent of 2. + auto ref_tv = scheduled_fusion->outputs().at(0)->as(); + // The outermost loop ID should be a Serial ID with an extent of 2. + EXPECT_EQ( + ref_tv->getLoopDomain().at(0)->getParallelType(), ParallelType::Serial); + EXPECT_TRUE(ref_tv->getLoopDomain().at(0)->extent()->isConstInt()); + EXPECT_EQ( + ref_tv->getLoopDomain().at(0)->extent()->evaluate().as(), 2L); + + IdModel id_model(scheduled_fusion, /*build_graphs=*/false); + const auto& exact_graph = id_model.buildExactGraph(); + + const auto ref_loop = exact_graph.toGroups(ref_tv->getLoopDomain()); + + // The other tensors, except for the pad output, should be fully inlined into + // the reference tensor. + for (auto tv : scheduled_fusion->allTvs()) { + if (tv->isFusionInput()) { + continue; + } + auto tv_loop = exact_graph.toGroups(tv->getLoopDomain()); + if (tv->definition() != nullptr && tv->definition()->isA()) { + ValGroups ref_groups{ref_loop.begin() + 1, ref_loop.end()}; + // In the case of pad, the loop domain of the output tensor + // should be mapped with the loop domain of the reference + // without the outermost ID. + EXPECT_EQ(tv_loop, ref_groups); + } else { + EXPECT_EQ(tv_loop, ref_loop); + EXPECT_EQ(tv->getLoopDomain().size(), tv->getComputeAtPosition()); + } + } +} + } // namespace nvfuser diff --git a/tests/python/mpi_fixtures.py b/tests/python/multidevice_fixtures.py similarity index 67% rename from tests/python/mpi_fixtures.py rename to tests/python/multidevice_fixtures.py index 2193b29a0b9..76b295a721b 100644 --- a/tests/python/mpi_fixtures.py +++ b/tests/python/multidevice_fixtures.py @@ -2,42 +2,38 @@ # All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause -import os +import nvfuser import pytest import torch -import nvfuser - -from mpi4py import MPI -class MpiTest: +class MultideviceTest: def __init__(self): - self._communicator = MPI.COMM_WORLD - self._local_size = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) - self._local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) + self._communicator = nvfuser.Communicator.instance() # This way, when individual tests create unsharded input, each rank # receives the same data. torch.manual_seed(0) + @property + def communicator(self): + return self._communicator + @property def size(self): - return self._communicator.size + return self._communicator.size() @property def rank(self): - return self._communicator.rank + return self._communicator.rank() @property def local_size(self): - return self._local_size + return self._communicator.local_size() @property def local_rank(self): - return self._local_rank - - def barrier(self): - self._communicator.barrier() + return self._communicator.local_rank() def shard_tensor( self, t: torch.Tensor, dim: int, mesh: nvfuser.DeviceMesh @@ -51,8 +47,8 @@ def shard_tensor( @pytest.fixture(scope="session") -def mpi_test(): - fixture = MpiTest() +def multidevice_test(): + fixture = MultideviceTest() yield fixture # Sync all ranks after each test for isolation. - fixture.barrier() + fixture.communicator.barrier() diff --git a/tests/python/opinfo_fusion_definitions.py b/tests/python/opinfo_fusion_definitions.py index 95abad9b7f4..768aa3a2953 100644 --- a/tests/python/opinfo_fusion_definitions.py +++ b/tests/python/opinfo_fusion_definitions.py @@ -28,7 +28,7 @@ def parse_inputs_fusion_definition(fd: FusionDefinition, opinfo: OpInfo, *args): ) num_symbolic_parameters = len(symbolic_parameter_list) - assert num_symbolic_parameters == len( + assert num_symbolic_parameters >= len( args ), f"{num_symbolic_parameters} vs {len(args)}" diff --git a/tests/python/opinfo_input_generators.py b/tests/python/opinfo_input_generators.py index d3222aea4b4..472d5109059 100644 --- a/tests/python/opinfo_input_generators.py +++ b/tests/python/opinfo_input_generators.py @@ -1591,3 +1591,39 @@ def div_input_generator( denom = torch.where(denom_is_small, denom_scaled_to_minabs, denom).detach() denom.requires_grad_(requires_grad) yield SampleInput(numer, denom) + + +def triu_input_generator(op: OpInfo, dtype: torch.dtype, requires_grad: bool = False): + offsets = (0, 1, -1, 2, 3, -3, 1024, -1024) + + for element in elementwise_unary_generator( + op, + dtype, + requires_grad, + enable_extremal_value_testing=False, + enable_large_value_testing=False, + enable_small_value_testing=False, + ): + if element.args[0].ndim < 2: + continue + # to test cases where offset is not passed as an argument + yield element + # to test cases where offset is passed as an argument + for offset in offsets: + yield SampleInput(*element.args, offset) + + +def triu_error_generator(op: OpInfo, dtype: torch.dtype, requires_grad: bool = False): + make_arg = partial( + make_tensor, device="cuda", dtype=dtype, requires_grad=requires_grad + ) + + invalid_shapes = ( + (), + (4,), + ) + + for shape in invalid_shapes: + yield SampleInput( + make_arg(shape), + ), RuntimeError, f"input tensor for triu must have 2 or more dims, but got {len(shape)} dims" diff --git a/tests/python/opinfos.py b/tests/python/opinfos.py index 9031a9bd091..f0bbd649b87 100644 --- a/tests/python/opinfos.py +++ b/tests/python/opinfos.py @@ -50,6 +50,8 @@ matmul_input_generator, linear_input_generator, 
linear_error_generator, + triu_input_generator, + triu_error_generator, ) from utils import ( bool_int_dtypes, @@ -1218,6 +1220,19 @@ def torch_reshape_sym_fn(input_tensor, output_shaped_tensor): ) linear_ops.append(linear_opinfo) +tv_val_ops = [] + +triu_opinfo = OpInfo( + lambda fd: fd.ops.triu, + "triu", + sample_input_generator=triu_input_generator, + error_input_generator=triu_error_generator, + reference=torch.triu, + symbolic_parameter_list=[ArgumentType.Symbolic, ArgumentType.Constant], +) + +tv_val_ops.append(triu_opinfo) + """ End Tensor Creation """ # Puts all opinfos into the "opinfos" list @@ -1231,3 +1246,4 @@ def torch_reshape_sym_fn(input_tensor, output_shaped_tensor): opinfos.extend(tensor_creation_ops) opinfos.extend(matmul_ops) opinfos.extend(linear_ops) +opinfos.extend(tv_val_ops) diff --git a/tests/python/test_communication.py b/tests/python/test_communication.py index 75a94f6cec4..650c0e05994 100644 --- a/tests/python/test_communication.py +++ b/tests/python/test_communication.py @@ -5,17 +5,17 @@ import pytest import torch -import mpi_fixtures +import multidevice_fixtures import nvfuser from nvfuser import DataType, FusionDefinition -mpi_test = mpi_fixtures.mpi_test +multidevice_test = multidevice_fixtures.multidevice_test @pytest.mark.mpi -def test_allgather(mpi_test): - d = mpi_test.size +def test_allgather(multidevice_test): + d = multidevice_test.size mesh = nvfuser.DeviceMesh(range(d)) class Model(FusionDefinition): @@ -38,7 +38,7 @@ def multidevice_schedule(self): self.sched.set_allocation_as_loop(self.out) unsharded = torch.randn(d * 4) - sharded = mpi_test.shard_tensor(unsharded, 0, mesh) + sharded = multidevice_test.shard_tensor(unsharded, 0, mesh) fd = Model() outputs = fd.execute([sharded]) @@ -46,8 +46,8 @@ def multidevice_schedule(self): @pytest.mark.mpi -def test_allreduce(mpi_test): - d = mpi_test.size +def test_allreduce(multidevice_test): + d = multidevice_test.size mesh = nvfuser.DeviceMesh(range(d)) class Model(FusionDefinition): @@ -63,7 +63,7 @@ def multidevice_schedule(self): self.sched.parallelize(self.inp, 0, nvfuser.ParallelType.mesh_x) unsharded = torch.randn(d, 4) - sharded = mpi_test.shard_tensor(unsharded, 0, mesh) + sharded = multidevice_test.shard_tensor(unsharded, 0, mesh) fd = Model() outputs = fd.execute([sharded]) @@ -71,8 +71,8 @@ def multidevice_schedule(self): @pytest.mark.mpi -def test_reduce_scatter(mpi_test): - d = mpi_test.size +def test_reduce_scatter(multidevice_test): + d = multidevice_test.size mesh = nvfuser.DeviceMesh(range(d)) class Model(FusionDefinition): @@ -94,18 +94,18 @@ def multidevice_schedule(self): self.sched.set_allocation_as_loop(self.out) unsharded = torch.randn(d, d * 4) - sharded = mpi_test.shard_tensor(unsharded, 0, mesh) + sharded = multidevice_test.shard_tensor(unsharded, 0, mesh) fd = Model() outputs = fd.execute([sharded]) torch.testing.assert_close( - outputs[0], mpi_test.shard_tensor(unsharded.sum(0), 0, mesh) + outputs[0], multidevice_test.shard_tensor(unsharded.sum(0), 0, mesh) ) @pytest.mark.mpi -def test_reduce_scatter_noncontiguous(mpi_test): - d = mpi_test.size +def test_reduce_scatter_noncontiguous(multidevice_test): + d = multidevice_test.size mesh = nvfuser.DeviceMesh(range(d)) class Model(FusionDefinition): @@ -136,10 +136,10 @@ def multidevice_schedule(self): self.sched.set_allocation_as_loop(self.out) unsharded = torch.randn(d, 3, d * 4) - sharded = mpi_test.shard_tensor(unsharded, 0, mesh) + sharded = multidevice_test.shard_tensor(unsharded, 0, mesh) fd = Model() outputs = 
fd.execute([sharded]) torch.testing.assert_close( - outputs[0], mpi_test.shard_tensor(unsharded.sum(0), 1, mesh) + outputs[0], multidevice_test.shard_tensor(unsharded.sum(0), 1, mesh) ) diff --git a/tests/python/test_multidevice.py b/tests/python/test_multidevice.py index 953561d9dd2..8c2d2a58286 100644 --- a/tests/python/test_multidevice.py +++ b/tests/python/test_multidevice.py @@ -7,22 +7,22 @@ from enum import Enum, auto from torch.nn.attention import SDPBackend -import mpi_fixtures +import multidevice_fixtures import nvfuser import utils from nvfuser import DataType, FusionDefinition -mpi_test = mpi_fixtures.mpi_test +multidevice_test = multidevice_fixtures.multidevice_test @pytest.mark.mpi -def test_sizes_and_ranks(mpi_test): +def test_sizes_and_ranks(multidevice_test): size, rank, local_size, local_rank = ( - mpi_test.size, - mpi_test.rank, - mpi_test.local_size, - mpi_test.local_rank, + multidevice_test.size, + multidevice_test.rank, + multidevice_test.local_size, + multidevice_test.local_rank, ) assert size > 0 assert rank >= 0 and rank < size @@ -31,8 +31,8 @@ def test_sizes_and_ranks(mpi_test): @pytest.mark.mpi -def test_pointwise(mpi_test): - num_devices = mpi_test.size +def test_pointwise(multidevice_test): + num_devices = multidevice_test.size mesh = nvfuser.DeviceMesh(range(num_devices)) class Model(FusionDefinition): @@ -51,7 +51,7 @@ def multidevice_schedule(self): self.sched.parallelize(self.t0, 0, nvfuser.ParallelType.mesh_x) unsharded_input = torch.randn(num_devices, 4) - sharded_input = mpi_test.shard_tensor(unsharded_input, 0, mesh) + sharded_input = multidevice_test.shard_tensor(unsharded_input, 0, mesh) fd = Model() outputs = fd.execute([sharded_input]) @@ -59,7 +59,7 @@ def multidevice_schedule(self): @pytest.mark.mpi -def test_linear(mpi_test): +def test_linear(multidevice_test): class Model(FusionDefinition): def __init__(self, num_devices, batch, sequence, hidden): super().__init__() @@ -83,10 +83,10 @@ def multidevice_schedule(self): for t in [self.weight, self.bias]: self.sched.parallelize(t, 0, nvfuser.ParallelType.mesh_x) - d = mpi_test.size - rank = mpi_test.rank + d = multidevice_test.size + rank = multidevice_test.rank - torch.cuda.set_device(mpi_test.local_rank) + torch.cuda.set_device(multidevice_test.local_rank) b, s, e = 2, 1024, 768 inp_tensor = torch.randn(b, s, e, device="cuda") @@ -112,8 +112,70 @@ def multidevice_schedule(self): @pytest.mark.mpi -def test_matmul_allreduce(mpi_test): - d, b, s, e = mpi_test.size, 1, 4, 8 +def test_linear_loop_split(multidevice_test): + class Model(FusionDefinition): + def __init__(self, num_devices, batch, sequence, hidden): + super().__init__() + self._num_devices = num_devices + self._batch = batch + self._sequence = sequence + self._hidden = hidden + + def definition(self): + d, b, s, e = self._num_devices, self._batch, self._sequence, self._hidden + self.inp = self.define_tensor([b, s, e]) + self.weight = self.define_tensor([d * e, e]) + self.bias = self.define_tensor([d * e]) + self.out = self.ops.linear(self.inp, self.weight, self.bias) + self.add_output(self.out) + + def multidevice_schedule(self): + for t in [self.inp, self.weight, self.bias, self.out]: + self.sched._set_device_mesh(t, mesh) + + # Shard N for weight (N, K) and bias (N) + for t in [self.weight, self.bias]: + self.sched.split(t, 0, d, False) + self.sched.parallelize(t, 0, nvfuser.ParallelType.mesh_x) + self.sched.set_allocation_as_loop(t) + + # Output of linear: {.., i{M}, i{N}, r{K}} + # Shard N -> axis(-2) + self.sched.split(self.out, -2, 
d, False) + self.sched.parallelize(self.out, -3, nvfuser.ParallelType.mesh_x) + self.sched.set_allocation_as_loop(self.out) + + d = multidevice_test.size + mesh = nvfuser.DeviceMesh(range(d)) + + torch.cuda.set_device(multidevice_test.local_rank) + + b, s, e = 2, 1024, 768 + inp_tensor = torch.randn(b, s, e, device="cuda") + unsharded_weight_tensor = torch.randn(d * e, e) + sharded_weight_tensor = multidevice_test.shard_tensor( + unsharded_weight_tensor, 0, mesh + ) + unsharded_bias_tensor = torch.randn(d * e) + sharded_bias_tensor = multidevice_test.shard_tensor(unsharded_bias_tensor, 0, mesh) + + fd = Model(d, b, s, e) + out_tensors = fd.execute([inp_tensor, sharded_weight_tensor, sharded_bias_tensor]) + + # [b, s, d*e] + unsharded_out_tensor = torch.nn.functional.linear( + inp_tensor.cpu(), unsharded_weight_tensor, unsharded_bias_tensor + ) + expected_out_tensor = multidevice_test.shard_tensor(unsharded_out_tensor, -1, mesh) + # rtol is the same as the default for fp32. atol is slightly increased. + torch.testing.assert_close( + out_tensors[0], expected_out_tensor, rtol=1.3e-6, atol=1e-3 + ) + + +@pytest.mark.mpi +def test_matmul_allreduce(multidevice_test): + d, b, s, e = multidevice_test.size, 1, 4, 8 class Model(FusionDefinition): def definition(self) -> None: @@ -137,9 +199,9 @@ def multidevice_schedule(self) -> None: self.sched._set_device_mesh(t, mesh) self.sched.parallelize(t, 0, nvfuser.ParallelType.mesh_x) - rank = mpi_test.rank + rank = multidevice_test.rank - torch.cuda.set_device(mpi_test.local_rank) + torch.cuda.set_device(multidevice_test.local_rank) unsharded_out_grad = torch.randn(b * s, d * e, dtype=torch.half, device="cpu") unsharded_weight = torch.randn(d * e, e, dtype=torch.half, device="cpu") @@ -172,8 +234,8 @@ class QkvFormat(Enum): ) @pytest.mark.parametrize("qkv_format", [QkvFormat.BHSE, QkvFormat.BSHE]) @pytest.mark.mpi -def test_sdpa(mpi_test, qkv_format: QkvFormat): - d, b, s, h, e = mpi_test.size, 2, 1024, 12, 768 +def test_sdpa(multidevice_test, qkv_format: QkvFormat): + d, b, s, h, e = multidevice_test.size, 2, 1024, 12, 768 if h % d != 0: pytest.skip(f"We only support even split, so {h} has to be divisible by {d}.") @@ -232,7 +294,7 @@ def multidevice_schedule(self) -> None: self.sched._set_device_mesh(t, mesh) self.sched.parallelize(t, 0, nvfuser.ParallelType.mesh_x) - torch.cuda.set_device(mpi_test.local_rank) + torch.cuda.set_device(multidevice_test.local_rank) def make_unsharded_tensor() -> torch.Tensor: return torch.randn(b, h, s, e // h, dtype=torch.bfloat16, device="cuda") @@ -247,7 +309,7 @@ def make_unsharded_tensor() -> torch.Tensor: expected_out.backward(out_grad) expected_q_grad, expected_k_grad, expected_v_grad = q.grad, k.grad, v.grad - rank = mpi_test.rank + rank = multidevice_test.rank # Head-parallelize Q, K, V or the attention output of an SDPA. def head_parallelize(t: torch.Tensor) -> torch.Tensor: @@ -693,9 +755,9 @@ def _assert_shape_dtype( reason="Flash Attention is only supported on Ampere and newer devices.", ) @pytest.mark.mpi -def test_transformer_forward(mpi_test, benchmark): - d = mpi_test.size - rank = mpi_test.rank +def test_transformer_forward(multidevice_test, benchmark): + d = multidevice_test.size + rank = multidevice_test.rank b, s, h, e = 1, 2048, 96, 12288 @@ -714,7 +776,7 @@ def test_transformer_forward(mpi_test, benchmark): "error. So I use `assert` instead of `pytest.skip`." 
) - torch.cuda.set_device(mpi_test.local_rank) + torch.cuda.set_device(multidevice_test.local_rank) # To reduce memory footprint, create unsharded data on CPU and copy only # the needed slice to GPU. @@ -1280,13 +1342,13 @@ def multidevice_schedule(self): reason="Flash Attention is only supported on Ampere and newer devices.", ) @pytest.mark.mpi -def test_transformer_backward(mpi_test, benchmark): - d = mpi_test.size - rank = mpi_test.rank +def test_transformer_backward(multidevice_test, benchmark): + d = multidevice_test.size + rank = multidevice_test.rank b, s, h, e = 1, 2048, 96, 12288 - torch.cuda.set_device(mpi_test.local_rank) + torch.cuda.set_device(multidevice_test.local_rank) mlp_linear0_out = torch.testing.make_tensor( d, b, s, e * 4 // d, dtype=torch.bfloat16, device="cpu" diff --git a/tests/python/test_ops.py b/tests/python/test_ops.py index d653e005736..bc842ea29dc 100644 --- a/tests/python/test_ops.py +++ b/tests/python/test_ops.py @@ -63,7 +63,7 @@ def parse_args_fusion_execution(opinfo: OpInfo, *args): else [ArgumentType.Symbolic] * len(args) ) - assert len(symbolic_parameter_list) == len(args) + assert len(symbolic_parameter_list) >= len(args) result = [] for arg_type, a in zip(symbolic_parameter_list, args): diff --git a/tests/python/test_python_frontend.py b/tests/python/test_python_frontend.py index 0b74fddeae6..7b7e1a40218 100644 --- a/tests/python/test_python_frontend.py +++ b/tests/python/test_python_frontend.py @@ -1204,6 +1204,20 @@ def fusion_func(fd: FusionDefinition): self.assertEqual(eager_out2, nvf_out[1]) # self.assertEqual(eager_out3, nvf_out[2]) + def test_triu(self): + inputs = [ + torch.randn(4, 16, device="cuda", dtype=torch.float16), + ] + + def fusion_func(fd: FusionDefinition) -> None: + t0 = fd.from_pytorch(inputs[0]) + t1 = fd.ops.triu(t0, -1) + fd.add_output(t1) + + nvf_out, _ = self.exec_nvfuser(fusion_func, inputs) + eager_out0 = torch.triu(inputs[0], -1) + self.assertEqual(eager_out0, nvf_out[0]) + def test_complex_rsqrt(self): inputs = [ torch.randn(4, device="cuda", dtype=torch.complex64), diff --git a/tests/python/test_transformer_engine.py b/tests/python/test_transformer_engine.py index 5c71633ecf1..fa4601f2900 100644 --- a/tests/python/test_transformer_engine.py +++ b/tests/python/test_transformer_engine.py @@ -2,19 +2,48 @@ # All rights reserved. # SPDX-License-Identifier: BSD-3-Clause +import os import pytest import torch import torch.distributed as dist +import transformer_engine.pytorch as te from enum import auto, Enum from functools import partial +from mpi4py import MPI -import transformer_engine.pytorch as te +class MpiTest: + def __init__(self): + self._communicator = MPI.COMM_WORLD + self._local_size = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) + self._local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) + + @property + def size(self): + return self._communicator.size + + @property + def rank(self): + return self._communicator.rank + + @property + def local_size(self): + return self._local_size + + @property + def local_rank(self): + return self._local_rank -import mpi_fixtures + def barrier(self): + self._communicator.barrier() -mpi_test = mpi_fixtures.mpi_test +@pytest.fixture(scope="session") +def mpi_test(): + fixture = MpiTest() + yield fixture + # Sync all ranks after each test for isolation. + fixture.barrier() class ComputeType(Enum):