diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index deeaed34ac..f256e8eec6 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -1,5 +1,6 @@ #!/bin/bash # Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -11,10 +12,16 @@ set -o xtrace build_qnn_backend() { echo "Start building qnn backend." export ANDROID_NDK_ROOT=/opt/ndk - export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 + export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)" - bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release + # Workaround for a missing flatccrt library (which can occur depending on the + # number of jobs used); see issue #7300: + # build twice (the second time with `--no_clean`) to make sure libflatccrt.a is + # available. + # TODO: Remove this workaround once the underlying issue is fixed. + bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release || \ + bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release --no_clean } set_up_aot() { diff --git a/.ci/scripts/setup-qnn-deps.sh b/.ci/scripts/setup-qnn-deps.sh index 1280974812..45588e291e 100644 --- a/.ci/scripts/setup-qnn-deps.sh +++ b/.ci/scripts/setup-qnn-deps.sh @@ -16,9 +16,9 @@ install_qnn() { QNN_INSTALLATION_DIR=/tmp/qnn mkdir -p "${QNN_INSTALLATION_DIR}" - curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip" + curl -Lo /tmp/v2.28.0.24.10.29.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.28.0.241029.zip" echo "Finishing downloading qnn sdk." - unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp + unzip -qo /tmp/v2.28.0.24.10.29.zip -d /tmp echo "Finishing unzip qnn sdk." diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index ddc7ad4618..550a09e4c6 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -121,7 +121,7 @@ echo "COREML option ${COREML}" if [[ "${MODE}" =~ .*qnn.* ]]; then QNN=ON export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" - export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 + export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" export PYTHONPATH=".." 
cp schema/program.fbs exir/_serialize/program.fbs diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 9c2a074372..0b4e27e5aa 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -28,6 +28,7 @@ ) from executorch.backends.arm._passes.decompose_linear_pass import DecomposeLinearPass from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass +from executorch.backends.arm._passes.decompose_select import DecomposeSelectPass from executorch.backends.arm._passes.decompose_softmaxes_pass import ( DecomposeSoftmaxesPass, ) @@ -62,7 +63,6 @@ ) from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass from executorch.exir import ExportedProgram -from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_manager import PassManager @@ -72,9 +72,7 @@ class ArmPassManager(PassManager): def _transform(self, graph_module: torch.fx.GraphModule): return self(graph_module).graph_module - def transform_to_backend_pipeline( - self, exported_program: ExportedProgram, compile_spec: list[CompileSpec] - ): + def transform_to_backend_pipeline(self, exported_program: ExportedProgram): """Apply passes before transforming program to backend""" self.add_pass(FuseQuantizedActivationPass()) self.add_pass(DecomposeLinearPass()) @@ -137,11 +135,8 @@ def transform_to_backend_pipeline( self.add_pass(KeepDimsFalseToSqueezePass()) self.add_pass(Conv1dUnsqueezePass(exported_program)) self.add_pass(DecomposeSoftmaxesPass()) - for spec in compile_spec: - if spec.key == "permute_memory_format": - memory_format = spec.value.decode() - if memory_format == "nhwc": - self.add_pass(AnnotateChannelsLastDimOrder()) + self.add_pass(DecomposeSelectPass()) + self.add_pass(AnnotateChannelsLastDimOrder()) return self._transform(exported_program.graph_module) diff --git a/backends/arm/_passes/decompose_select.py b/backends/arm/_passes/decompose_select.py new file mode 100644 index 0000000000..9ea836e633 --- /dev/null +++ b/backends/arm/_passes/decompose_select.py @@ -0,0 +1,56 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +import torch +from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class DecomposeSelectPass(ExportPass): + """ + This pass decomposes select into slice + squeeze to ensure that ATen and TOSA outputs have the same rank (input rank - 1). + """ + + def call(self, graph_module: torch.fx.GraphModule): + for node in graph_module.graph.nodes: + + if node.op != "call_function": + continue + + if node.target in ( + exir_ops.edge.aten.select.int, + exir_ops.edge.aten.select_copy.int, + ): + slice_op = exir_ops.edge.aten.slice_copy.Tensor + squeeze_op = exir_ops.edge.aten.squeeze_copy.dims + else: + continue + + input_node, dim, index = node.args + + rank = len(input_node.meta["val"].size()) + dim = dim % rank if dim < 0 else dim + index = index % input_node.meta["val"].size()[dim] if index < 0 else index # wrap around the dim size, not the rank + dim_list = [dim] # only squeeze the selected dim + + with graph_module.graph.inserting_before(node): + slice_node = create_node( + graph_module.graph, slice_op, (input_node, dim, index, index + 1) + ) + squeeze_node = create_node( + graph_module.graph, squeeze_op, (slice_node, dim_list) + ) + + node.replace_all_uses_with(squeeze_node) + graph_module.graph.erase_node(node) + + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, True) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index e2fdc42b11..4ce95fda43 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024 Arm Limited and/or its affiliates. +# Copyright 2023-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -49,8 +49,6 @@ def __init__(self): self.compiler_flags = [] self.output_format = None self.path_for_intermediates = None - # TODO MLETORCH-265 Remove permute_nhwc flag - self.permute_nhwc = False self.quantize_io = False self.tosa_version = None self.input_order = None @@ -118,16 +116,6 @@ def dump_intermediate_artifacts_to( self.path_for_intermediates = output_path return self - def set_permute_memory_format( - self, set_nhwc_permutation: bool = True - ) -> "ArmCompileSpecBuilder": - """ - Permute to channel last in compiler and runtime. Compilation and - runtime will convert rank 4 inputs to channel last for each sub-graph. - """ - self.permute_nhwc = set_nhwc_permutation - return self - def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": """ Quantization of inputs and dequantization of outputs for cases where @@ -170,11 +158,6 @@ def build(self) -> List[CompileSpec]: CompileSpec("debug_artifact_path", self.path_for_intermediates.encode()) ) - if self.permute_nhwc: - self.compile_spec.append( - CompileSpec("permute_memory_format", "nhwc".encode()) - ) - if self.input_order: self.compile_spec.append( CompileSpec( @@ -188,13 +171,6 @@ def build(self) -> List[CompileSpec]: return self.compile_spec -def is_permute_memory(compile_spec: List[CompileSpec]) -> bool: - for spec in compile_spec: - if spec.key == "permute_memory_format": - return spec.value.decode() == "nhwc" - return False - - def is_tosa(compile_spec: List[CompileSpec]) -> bool: for spec in compile_spec: if spec.key == "output_format": @@ -264,7 +240,7 @@ def preprocess( # noqa: C901 # const data directly.
Path created and data written only in debug builds. tosa_graph = ts.TosaSerializer(artifact_path) graph_module = ArmPassManager().transform_to_backend_pipeline( - exported_program=edge_program, compile_spec=compile_spec + exported_program=edge_program ) node_visitors = get_node_visitors(edge_program, tosa_spec) diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index ee5f2807a9..157e5ec092 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -30,7 +30,6 @@ op_repeat, op_rshift, op_rsqrt, - op_select, op_sigmoid, op_slice, op_squeeze, diff --git a/backends/arm/operators/op_select.py b/backends/arm/operators/op_select.py deleted file mode 100644 index b047a5dd47..0000000000 --- a/backends/arm/operators/op_select.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2024 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -from typing import List - -import serializer.tosa_serializer as ts -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) - -from executorch.backends.arm.tosa_mapping import TosaArg - -from executorch.backends.arm.tosa_utils import build_reshape, tosa_shape -from serializer.tosa_serializer import TosaOp -from torch.fx import Node - - -@register_node_visitor -class SelectVisitor(NodeVisitor): - target = "aten.select_copy.int" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - - assert len(inputs) == 3 - input_node, dim, index = inputs - shape = input_node.shape - rank = len(shape) - - dim = dim.number % rank if dim.number < 0 else dim.number - index = index.number % rank if index.number < 0 else index.number - - # For aten.select_copy, the output will be rank[input_shape - 1] - # For TOSA rank(in) == rank(out). - # Add an intermediate with the same rank - expanded_shape = tuple(1 if i == dim else shape[i] for i in range(rank)) - expanded_shape = tosa_shape(expanded_shape, input_node.dim_order) - - output_reshaped = tosa_graph.addIntermediate(expanded_shape, output.dtype) - - attr_slice = ts.TosaSerializerAttribute() - - start_attr = [index if i == dim else 0 for i in input_node.dim_order] - size_attr = [ - 1 if i == dim else input_node.shape[i] for i in input_node.dim_order - ] - - attr_slice.SliceAttribute(start_attr, size_attr) - - tosa_graph.addOperator( - TosaOp.Op().SLICE, [input_node.name], [output_reshaped.name], attr_slice - ) - - # Reshape back to original rank of output. - build_reshape(tosa_graph, output_reshaped.name, output.shape, output.name) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 2cc716391b..843e48603b 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2023-2024 Arm Limited and/or its affiliates. + * Copyright 2023-2025 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
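With op_select.py deleted above, aten.select_copy.int no longer has a dedicated TOSA node visitor; it is instead rewritten by the new DecomposeSelectPass added earlier in this diff. A standalone PyTorch sketch of the identity the decomposition relies on (illustration only, not code from this patch; narrow/squeeze stand in for the edge-dialect slice_copy/squeeze_copy ops the pass emits):

```python
# select(x, dim, index) == squeeze(slice(x, dim, index, index + 1), dim):
# the width-1 slice keeps the input rank, so the intermediate satisfies
# TOSA's rank(in) == rank(out) constraint, and the squeeze then restores
# the rank reduction that select implies.
import torch

x = torch.randn(2, 3, 4)
dim, index = -1, -2          # negative arguments, which the pass must normalize

rank = x.dim()
dim = dim % rank             # -1 -> 2
index = index % x.size(dim)  # wraps around the dim size: -2 -> 2

sliced = x.narrow(dim, index, 1)  # shape (2, 3, 1): rank preserved
squeezed = sliced.squeeze(dim)    # shape (2, 3): rank reduced by one

assert torch.equal(squeezed, x.select(dim, index))
```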
@@ -76,7 +76,6 @@ namespace arm { typedef struct { FreeableBuffer* processed; - bool permuted_io_flag; } ExecutionHandle; extern "C" { @@ -125,14 +124,6 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(allocator, ExecutionHandle); handle->processed = processed; - handle->permuted_io_flag = false; - for (auto& compile_spec : compile_specs) { - if (0 == std::strcmp(compile_spec.key, "permute_memory_format") && - 0 == std::memcmp(compile_spec.value.buffer, "nhwc", 4)) { - handle->permuted_io_flag = true; - } - } - // Return the same buffer we were passed - this data will be // executed directly return handle; @@ -225,11 +216,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { // which require permutation. bool permuted_input_shape; ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( - i, - tensor_in, - &handles.inputs->io[i], - execution_handle->permuted_io_flag, - &permuted_input_shape)); + i, tensor_in, &handles.inputs->io[i], &permuted_input_shape)); bool both_char = tensor_in.scalar_type() == ScalarType::Char and handles.inputs->io[i].elem_size == 1; bool both_int = tensor_in.scalar_type() == ScalarType::Int and @@ -330,11 +317,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { bool permuted_output_shape; ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( - i, - tensor_out, - &handles.outputs->io[i], - execution_handle->permuted_io_flag, - &permuted_output_shape)); + i, tensor_out, &handles.outputs->io[i], &permuted_output_shape)); if (tensor_out.scalar_type() == ScalarType::Char and permuted_output_shape) { EXECUTORCH_PROF_SCOPE( @@ -395,7 +378,6 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { int index, const executorch::aten::Tensor tensor, VelaIO* io, - bool permuted_io_flag, bool* is_permuted) const { bool permuted_shape = false; @@ -409,12 +391,6 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { if (permuted_shape) { ET_LOG(Debug, "Tensor input/output %d will be permuted", index); } - if (permuted_io_flag != permuted_shape) { - ET_LOG( - Error, - "Permute compile flag and permuted input/output don't agree"); - return Error::InvalidProgram; - } } *is_permuted = permuted_shape; return Error::Ok; diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 8838cb72d6..ba80f1c2d7 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -56,19 +56,15 @@ def maybe_get_tosa_collate_path() -> str | None: return None -def get_tosa_compile_spec( - tosa_version: str, permute_memory_to_nhwc=True, custom_path=None -) -> list[CompileSpec]: +def get_tosa_compile_spec(tosa_version: str, custom_path=None) -> list[CompileSpec]: """ Default compile spec for TOSA tests. """ - return get_tosa_compile_spec_unbuilt( - tosa_version, permute_memory_to_nhwc, custom_path - ).build() + return get_tosa_compile_spec_unbuilt(tosa_version, custom_path).build() def get_tosa_compile_spec_unbuilt( - tosa_version: str, permute_memory_to_nhwc=False, custom_path=None + tosa_version: str, custom_path=None ) -> ArmCompileSpecBuilder: """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify the compile spec before calling .build() to finalize it. 
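With the permute_memory_format compile spec gone, the test helpers in backends/arm/test/common.py (continued below) drop their permute_memory_to_nhwc parameter, and channels-last annotation is now applied unconditionally by ArmPassManager. A minimal caller sketch, assuming an ExecuTorch checkout with the Arm backend test utilities on the path (illustration, not code from this patch; the artifact path is hypothetical):

```python
# The helpers are now invoked without any permute flag; only quantize_io,
# custom_path and reorder_inputs remain as knobs.
from executorch.backends.arm.test import common

tosa_spec = common.get_tosa_compile_spec("TOSA-0.80+BI")  # list[CompileSpec]
u55_spec = common.get_u55_compile_spec(quantize_io=True)
u85_spec = common.get_u85_compile_spec(custom_path="/tmp/arm_artifacts")
```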
@@ -81,7 +77,6 @@ def get_tosa_compile_spec_unbuilt( compile_spec_builder = ( ArmCompileSpecBuilder() .tosa_compile_spec(tosa_version) - .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(custom_path) ) @@ -89,7 +84,6 @@ def get_tosa_compile_spec_unbuilt( def get_u55_compile_spec( - permute_memory_to_nhwc=True, quantize_io=False, custom_path=None, reorder_inputs=None, @@ -98,7 +92,6 @@ def get_u55_compile_spec( Default compile spec for Ethos-U55 tests. """ return get_u55_compile_spec_unbuilt( - permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path, reorder_inputs=reorder_inputs, @@ -106,7 +99,6 @@ def get_u55_compile_spec( def get_u85_compile_spec( - permute_memory_to_nhwc=True, quantize_io=False, custom_path=None, reorder_inputs=None, @@ -115,7 +107,6 @@ def get_u85_compile_spec( Default compile spec for Ethos-U85 tests. """ return get_u85_compile_spec_unbuilt( - permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path, reorder_inputs=reorder_inputs, @@ -123,7 +114,6 @@ def get_u85_compile_spec( def get_u55_compile_spec_unbuilt( - permute_memory_to_nhwc=True, quantize_io=False, custom_path=None, reorder_inputs=None, @@ -143,7 +133,6 @@ def get_u55_compile_spec_unbuilt( extra_flags="--debug-force-regor --output-format=raw", ) .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) - .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(artifact_path) .set_input_order(reorder_inputs) ) @@ -151,7 +140,6 @@ def get_u55_compile_spec_unbuilt( def get_u85_compile_spec_unbuilt( - permute_memory_to_nhwc=True, quantize_io=False, custom_path=None, reorder_inputs=None, @@ -169,7 +157,6 @@ def get_u85_compile_spec_unbuilt( extra_flags="--output-format=raw", ) .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) - .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(artifact_path) .set_input_order(reorder_inputs) ) diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index b5ff882537..b2fc271ade 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -111,7 +111,6 @@ def test_numerical_diff_prints(self): example_inputs=model.get_inputs(), compile_spec=common.get_tosa_compile_spec( "TOSA-0.80+MI", - permute_memory_to_nhwc=True, custom_path=tempfile.mkdtemp("diff_print_test"), ), ) diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index fca743a6fa..d29695dedf 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -56,9 +56,7 @@ def test_mv2_tosa_MI(self): ArmTester( self.mv2, example_inputs=self.model_inputs, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() .to_edge_transform_and_lower(edge_compile_config=self._edge_compile_config) @@ -71,9 +69,7 @@ def test_mv2_tosa_BI(self): ArmTester( self.mv2, example_inputs=self.model_inputs, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), ) .quantize() .export() @@ -92,7 +88,7 @@ def test_mv2_u55_BI(self): ArmTester( self.mv2, example_inputs=self.model_inputs, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=common.get_u55_compile_spec(), ) .quantize() .export() @@ -112,7 +108,7 @@ def test_mv2_u85_BI(self): ArmTester( self.mv2, example_inputs=self.model_inputs, - compile_spec=common.get_u85_compile_spec(permute_memory_to_nhwc=True), + compile_spec=common.get_u85_compile_spec(), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 24faace007..0aa3c6cba9 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -137,7 +137,7 @@ def test_add_u55_BI(self, test_data: torch.Tensor): test_data = (test_data,) self._test_add_ethos_BI_pipeline( self.Add(), - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), test_data, ) @@ -147,7 +147,7 @@ def test_add_u85_BI(self, test_data: torch.Tensor): test_data = (test_data,) self._test_add_ethos_BI_pipeline( self.Add(), - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), test_data, ) diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index 27629701c3..bc37fbb136 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -57,9 +57,7 @@ def _test_avgpool2d_tosa_MI_pipeline( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() .check(["torch.ops.aten.avg_pool2d.default"]) @@ -81,7 +79,7 @@ def _test_avgpool2d_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) @@ -155,7 +153,7 @@ def test_avgpool2d_tosa_u55_BI( ): self._test_avgpool2d_tosa_ethos_BI_pipeline( self.AvgPool2d(*model_params), - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), (test_data,), ) @@ -169,6 +167,6 @@ def test_avgpool2d_tosa_u85_BI( ): self._test_avgpool2d_tosa_ethos_BI_pipeline( self.AvgPool2d(*model_params), - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), (test_data,), ) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 0b830fa46b..06470d91e8 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -159,7 +159,7 @@ def test_bmm_u55_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor) self.BMM(), common.get_u55_compile_spec(), test_data ) - @parameterized.expand(BMM.test_parameters[:1]) + @parameterized.expand(BMM.test_parameters) @pytest.mark.corstone_fvp def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) @@ -167,15 +167,6 @@ def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): self.BMM(), common.get_u85_compile_spec(), test_data ) - @parameterized.expand(BMM.test_parameters[1:]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_bmm_u85_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_bmm_ethosu_BI_pipeline( - self.BMM(), common.get_u85_compile_spec(), test_data - ) - # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(BMMSingleInput.test_parameters) @pytest.mark.corstone_fvp diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index 593260ac56..b754a91f36 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -228,7 +228,7 @@ def _test_conv1d_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -250,7 +250,7 @@ def _test_conv1d_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -291,18 +291,13 @@ def test_conv1d_tosa_MI(self, test_name, model): def test_conv1d_tosa_BI(self, test_name, model): self._test_conv1d_tosa_BI_pipeline(model, model.get_inputs()) - # Expeted to fail as Conv1D requires transpoes which isn't supported on u55 @parameterized.expand(testsuite) @pytest.mark.corstone_fvp - @unittest.expectedFailure def test_conv1d_u55_BI(self, test_name, model): self._test_conv1d_ethosu_BI_pipeline( model, common.get_u55_compile_spec(), model.get_inputs() ) - # This specific test case has numerical errors on FVP, MLETORCH-520. - testsuite.remove(("5_3x2x128_st1", conv1d_5_3x2x128_st1)) - @parameterized.expand(testsuite) @pytest.mark.corstone_fvp def test_conv1d_u85_BI(self, test_name, model): diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 9ccac53940..bbcb421ce7 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -255,7 +255,7 @@ def _test_conv2d_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -277,7 +277,7 @@ def _test_conv2d_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -330,7 +330,7 @@ def test_conv2d_tosa_BI(self, test_name, model): @pytest.mark.corstone_fvp def test_conv2d_u55_BI(self, test_name, model): self._test_conv2d_ethosu_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), model, model.get_inputs(), ) @@ -339,7 +339,7 @@ def test_conv2d_u55_BI(self, test_name, model): @pytest.mark.corstone_fvp def test_conv2d_u85_BI(self, test_name, model): self._test_conv2d_ethosu_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), model, model.get_inputs(), ) diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 4a5615f97c..8352727a1c 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -194,7 +194,7 @@ def _test_conv_combo_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -218,7 +218,7 @@ def _test_conv_combo_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -273,7 +273,7 @@ def test_conv_meandim_u55_BI(self): model = ComboConv2dMeandim() self._test_conv_combo_ethos_BI_pipeline( model, - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), model.get_inputs(), ) @@ -282,7 +282,7 @@ def test_conv_meandim_u85_BI(self): model = ComboConv2dMeandim() self._test_conv_combo_ethos_BI_pipeline( model, - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), model.get_inputs(), ) @@ -334,7 +334,7 @@ def test_conv_relu6_u55_BI(self, test_data: torch.Tensor): model = ComboConvRelu6() test_data = (test_data,) self._test_conv_combo_ethos_BI_pipeline( - model, common.get_u55_compile_spec(permute_memory_to_nhwc=True), test_data + model, common.get_u55_compile_spec(), test_data ) @parameterized.expand(ComboConvRelu6.test_data) @@ -343,7 +343,7 @@ def test_conv_relu6_u85_BI(self, test_data: torch.Tensor): model = ComboConvRelu6() test_data = (test_data,) self._test_conv_combo_ethos_BI_pipeline( - model, common.get_u85_compile_spec(permute_memory_to_nhwc=True), test_data + model, common.get_u85_compile_spec(), test_data ) ############################### @@ -364,7 +364,7 @@ def test_block_bottleneck_residual_u55_BI(self): model = ComboBlockBottleneckResidual() self._test_conv_combo_ethos_BI_pipeline( model, - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), model.get_inputs(), ) @@ -373,7 +373,7 @@ def test_block_bottleneck_residual_u85_BI(self): model = ComboBlockBottleneckResidual() self._test_conv_combo_ethos_BI_pipeline( model, - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), model.get_inputs(), ) diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 3ce7584086..e183dcc9c6 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -152,9 +152,9 @@ testsuite_conv2d = [ ("2x2_1x6x4x4_gp6_st1", dw_conv2d_2x2_1x6x4x4_gp6_st1), ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1), + ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias), ("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1), ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3), - ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias), ("two_dw_conv2d", two_dw_conv2d), ] @@ -191,7 +191,7 @@ def _test_dw_conv_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -211,7 +211,7 @@ def _test_dw_conv_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -257,37 +257,37 @@ def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_tosa_BI_pipeline(model, model.get_inputs()) - testsuite_conv2d.remove( - ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1) - ) # Works - - @parameterized.expand(testsuite_conv2d, skip_on_empty=True) + @parameterized.expand(testsuite_conv2d[:4], skip_on_empty=True) @pytest.mark.corstone_fvp - @unittest.expectedFailure def test_dw_conv2d_u55_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u55_compile_spec( - permute_memory_to_nhwc=True, quantize_io=set_quantize_io - ), + common.get_u55_compile_spec(quantize_io=set_quantize_io), + model.get_inputs(), + ) + + @parameterized.expand(testsuite_conv2d[4:], skip_on_empty=True) + @pytest.mark.corstone_fvp + @unittest.expectedFailure # TODO: MLETORCH-516 + def test_dw_conv2d_u55_BI_xfails( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv_ethos_BI_pipeline( + model, + common.get_u55_compile_spec(quantize_io=set_quantize_io), model.get_inputs(), ) - # Expected to fail as conv1d needs transpose which is not supported - # on u55. @parameterized.expand(testsuite_conv1d, skip_on_empty=True) @pytest.mark.corstone_fvp - @unittest.expectedFailure def test_dw_conv1d_u55_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u55_compile_spec( - permute_memory_to_nhwc=True, quantize_io=set_quantize_io - ), + common.get_u55_compile_spec(quantize_io=set_quantize_io), model.get_inputs(), ) @@ -298,9 +298,7 @@ def test_dw_conv_u85_BI( ): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u85_compile_spec( - permute_memory_to_nhwc=True, quantize_io=set_quantize_io - ), + common.get_u85_compile_spec(quantize_io=set_quantize_io), model.get_inputs(), ) @@ -313,8 +311,6 @@ def test_dw_conv_u85_BI_xfails( ): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u85_compile_spec( - permute_memory_to_nhwc=True, quantize_io=set_quantize_io - ), + common.get_u85_compile_spec(quantize_io=set_quantize_io), model.get_inputs(), ) diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index d5f6174469..062dbfacae 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. 
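The test_depthwise_conv.py hunks above use a pattern repeated throughout this diff: the parameter list is sliced so that passing cases run normally while known failures stay tracked under @unittest.expectedFailure instead of being deleted. A self-contained illustration of that pattern (hypothetical names, not code from this patch):

```python
import unittest

from parameterized import parameterized

CASES = [("small", 1), ("medium", 2), ("large", 3)]


class DemoTest(unittest.TestCase):
    @parameterized.expand(CASES[:2])  # expected to pass
    def test_supported(self, name, value):
        self.assertGreater(value, 0)

    # skip_on_empty mirrors the diff's usage and tolerates an empty slice.
    @parameterized.expand(CASES[2:], skip_on_empty=True)
    @unittest.expectedFailure
    def test_known_failure(self, name, value):
        self.assertLess(value, 0)


if __name__ == "__main__":
    unittest.main()
```

Keeping the failing slice parameterized means the tracked issue (here, TODO: MLETORCH-516) stays executable and will surface as an unexpected success once the underlying problem is fixed.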
-# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -185,7 +185,7 @@ def test_div_tosa_BI( test_data = (input_, other_) self._test_div_tosa_BI_pipeline(self.Div(), test_data) - @parameterized.expand(test_data_suite[:2]) + @parameterized.expand(test_data_suite[:3]) @pytest.mark.corstone_fvp def test_div_u55_BI( self, @@ -200,7 +200,7 @@ def test_div_u55_BI( ) # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[2:]) + @parameterized.expand(test_data_suite[3:]) @pytest.mark.corstone_fvp @conftest.expectedFailureOnFVP def test_div_u55_BI_xfails( @@ -215,7 +215,7 @@ def test_div_u55_BI_xfails( self.Div(), common.get_u55_compile_spec(), test_data ) - @parameterized.expand(test_data_suite[:2]) + @parameterized.expand(test_data_suite[:3]) @pytest.mark.corstone_fvp def test_div_u85_BI( self, @@ -230,7 +230,7 @@ def test_div_u85_BI( ) # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[2:]) + @parameterized.expand(test_data_suite[3:]) @pytest.mark.corstone_fvp @conftest.expectedFailureOnFVP def test_div_u85_BI_xfails( diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 1b6d6e6ae3..fc82fa4dd7 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -76,15 +76,12 @@ def _test_full_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple, - permute_memory_to_nhwc: bool, ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=permute_memory_to_nhwc - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), ) .quantize() .export() @@ -134,7 +131,7 @@ def test_const_full_tosa_MI(self): def test_const_full_nhwc_tosa_BI(self): _input = torch.rand((2, 2, 3, 3)) * 10 - self._test_full_tosa_BI_pipeline(self.AddConstFull(), (_input,), True) + self._test_full_tosa_BI_pipeline(self.AddConstFull(), (_input,)) @parameterized.expand(AddVariableFull.test_parameters) def test_full_tosa_MI(self, test_tensor: Tuple): @@ -144,7 +141,7 @@ def test_full_tosa_MI(self, test_tensor: Tuple): @parameterized.expand(AddVariableFull.test_parameters) def test_full_tosa_BI(self, test_tensor: Tuple): - self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor, False) + self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor) # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(AddVariableFull.test_parameters) diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index 2d88421fb5..0570afc03e 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -78,7 +78,7 @@ def _test_layernorm_tosa_MI_pipeline( model=module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -99,7 +99,7 @@ def _test_layernorm_tosa_BI_pipeline( model=module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -158,9 +158,21 @@ def test_layer_norm_tosa_BI( self.LayerNorm(*model_params), (test_data,) ) + @parameterized.expand(test_data_suite[4:]) + @pytest.mark.corstone_fvp + def test_layer_norm_u55_BI( + self, + test_name: str, + test_data: torch.Tensor, + model_params, + ): + self._test_layernorm_ethosu_BI_pipeline( + self.LayerNorm(*model_params), common.get_u55_compile_spec(), (test_data,) + ) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 # Skip tests that require transposes. - @parameterized.expand(test_data_suite) + @parameterized.expand(test_data_suite[:4]) @pytest.mark.corstone_fvp @unittest.expectedFailure def test_layer_norm_u55_BI_xfails( diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index cd14b7801d..825b2f9bc9 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -137,7 +137,7 @@ def _test_linear_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -157,7 +157,7 @@ def _test_linear_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index 69c8ee06ec..d1581423a0 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -31,10 +31,6 @@ ("ones", torch.ones(10, 10), 1), ("ones_neg_dim", torch.ones(10, 3, 4), -1), ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), -] - -test_data_suite_u55_xfails = [ - # (test_name, test_data, dim) ("zeros", torch.zeros(10, 8, 5, 2), 0), ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), ("rand", torch.rand(1, 2, 5, 8), 2), @@ -161,19 +157,6 @@ def test_logsoftmax_tosa_u55_BI( self.LogSoftmax(dim=dim), (test_data,) ) - # Expected to fail as this is not supported on u55. 
- @parameterized.expand(test_data_suite_u55_xfails) - @unittest.expectedFailure - def test_logsoftmax_tosa_u55_BI_xfails( - self, - test_name: str, - test_data: torch.Tensor, - dim: int, - ): - self._test_logsoftmax_tosa_u55_BI_pipeline( - self.LogSoftmax(dim=dim), (test_data,) - ) - @parameterized.expand(test_data_suite) def test_logsoftmax_tosa_u85_BI( self, diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index a693c7d549..81f27beab4 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ b/backends/arm/test/ops/test_max_pool.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -65,7 +65,7 @@ def _test_maxpool2d_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -92,7 +92,7 @@ def _test_maxpool2d_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) @@ -171,7 +171,7 @@ def test_maxpool2d_tosa_u55_BI( ): tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( self.MaxPool2d(*model_params), - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), (test_data,), ) if conftest.is_option_enabled("corstone_fvp"): @@ -189,7 +189,7 @@ def test_maxpool2d_tosa_u85_BI( ): tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( self.MaxPool2d(*model_params), - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), (test_data,), ) if conftest.is_option_enabled("corstone_fvp"): @@ -230,7 +230,7 @@ def test_maxpool2d_tosa_u55_BI_mult_batches( ): tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( self.MaxPool2d(*model_params), - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), (test_data,), ) if conftest.is_option_enabled("corstone_fvp"): @@ -249,7 +249,7 @@ def test_maxpool2d_tosa_u85_BI_mult_batches( ): tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( self.MaxPool2d(*model_params), - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), (test_data,), ) if conftest.is_option_enabled("corstone_fvp"): diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e4f6afcbd6..393cf1667e 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -269,10 +269,8 @@ def test_meandim_tosa_BI( ): self._test_meandim_tosa_BI_pipeline(self.MeanDim(dim, keepdim), (test_data,)) - # Expected to fail as this is not supported on u55. 
@parameterized.expand(MeanDim.test_data_suite) - @unittest.expectedFailure - def test_meandim_tosa_u55_BI_xfails( + def test_meandim_tosa_u55_BI( self, test_name: str, test_data: torch.Tensor, diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index 9d789a8e33..715673b87c 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -73,7 +73,7 @@ def _test_mul_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -94,7 +94,7 @@ def _test_mul_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index b373af1401..ec7ecaa81b 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -53,15 +53,12 @@ def _test_permute_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor], - permute_memory_to_nhwc: bool, ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=permute_memory_to_nhwc - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() .check(["torch.ops.aten.permute.default"]) @@ -127,10 +124,8 @@ def _test_permute_ethos_BI_pipeline( def test_permute_tosa_MI( self, test_name: str, test_data: torch.Tensor, dims: list[int] ): - self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,), True) - self._test_permute_tosa_MI_pipeline( - self.Permute(dims=dims), (test_data,), False - ) + self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,)) + self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,)) @parameterized.expand(test_data_suite) def test_permute_tosa_BI( @@ -141,7 +136,6 @@ def test_permute_tosa_BI( # Expected to fail as TOSA.Transpose is not supported by Ethos-U55. @parameterized.expand(test_data_suite[0:1]) @pytest.mark.corstone_fvp - @unittest.expectedFailure def test_permute_u55_BI( self, test_name: str, test_data: torch.Tensor, dims: list[int] ): diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index f43f7af13c..bad872792b 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -107,20 +107,12 @@ def test_repeat_tosa_MI(self, test_input, multiples): def test_repeat_tosa_BI(self, test_input, multiples): self._test_repeat_tosa_BI_pipeline(self.Repeat(), (test_input, multiples)) - @parameterized.expand(Repeat.test_parameters[:-1]) + @parameterized.expand(Repeat.test_parameters) def test_repeat_u55_BI(self, test_input, multiples): self._test_repeat_ethosu_pipeline( common.get_u55_compile_spec(), self.Repeat(), (test_input, multiples) ) - # Final test requires transpose which is not supported on u55. - @parameterized.expand(Repeat.test_parameters[-1:]) - @unittest.expectedFailure - def test_repeat_u55_BI_xfails(self, test_input, multiples): - self._test_repeat_ethosu_pipeline( - common.get_u55_compile_spec(), self.Repeat(), (test_input, multiples) - ) - @parameterized.expand(Repeat.test_parameters) def test_repeat_u85_BI(self, test_input, multiples): self._test_repeat_ethosu_pipeline( diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py index c39b20a731..b474da573f 100644 --- a/backends/arm/test/ops/test_select.py +++ b/backends/arm/test/ops/test_select.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -51,21 +51,19 @@ def _test_select_tosa_MI_pipeline( test_data: test_data_t, export_target: str, ): - # For 4D tensors, do not permute to NHWC - permute = False if len(test_data[0].shape) == 4 else True ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=permute - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() .check([export_target]) .check_not(["torch.ops.quantized_decomposed"]) .to_edge() + .dump_artifact() .partition() + .dump_artifact() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .run_method_and_compare_outputs(inputs=test_data) @@ -77,15 +75,11 @@ def _test_select_tosa_BI_pipeline( test_data: test_data_t, export_target: str, ): - # For 4D tensors, do not permute to NHWC - permute = False if len(test_data[0].shape) == 4 else True ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=permute - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), ) .quantize() .export() @@ -124,10 +118,8 @@ def _test_select_ethos_BI_pipeline( def _test_select_tosa_u55_BI_pipeline( self, module: torch.nn.Module, test_data: test_data_t, export_target: str ): - # For 4D tensors, do not permute to NHWC - permute = False if len(test_data[0].shape) == 4 else True self._test_select_ethos_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=permute), + common.get_u55_compile_spec(), module, test_data, export_target, @@ -136,10 +128,8 @@ def _test_select_tosa_u55_BI_pipeline( def _test_select_tosa_u85_BI_pipeline( self, module: torch.nn.Module, test_data: test_data_t, export_target: str ): - # For 4D tensors, do not permute to NHWC - permute = False if len(test_data[0].shape) == 4 else True self._test_select_ethos_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=permute), + common.get_u85_compile_spec(), module, test_data, export_target, diff --git a/backends/arm/test/ops/test_slice.py 
b/backends/arm/test/ops/test_slice.py index 511873a8c2..7cb82e3a82 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -52,16 +52,14 @@ def _test_slice_tosa_MI_pipeline( ) def _test_slice_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor], permute: bool + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=permute - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), ) .quantize() .export() @@ -114,11 +112,11 @@ def test_slice_tosa_MI(self, tensor): @parameterized.expand(Slice.test_tensors[:2]) def test_slice_nchw_tosa_BI(self, test_tensor: torch.Tensor): - self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), False) + self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,)) @parameterized.expand(Slice.test_tensors[2:]) def test_slice_nhwc_tosa_BI(self, test_tensor: torch.Tensor): - self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), True) + self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,)) @parameterized.expand(Slice.test_tensors) def test_slice_u55_BI(self, test_tensor: torch.Tensor): diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index fd78d1a9ac..794f6b791f 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -33,10 +33,6 @@ ("ones", torch.ones(10, 10), 1), ("ones_neg_dim", torch.ones(10, 3, 4), -1), ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), -] - -test_data_suite_u55_xfails = [ - # (test_name, test_data, dim) ("zeros", torch.zeros(10, 8, 5, 2), 0), ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), ("rand", torch.rand(1, 2, 5, 8), 2), @@ -161,17 +157,6 @@ def test_softmax_tosa_u55_BI( ): self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) - # Expected to fail as this is not supported on u55. - @parameterized.expand(test_data_suite_u55_xfails) - @unittest.expectedFailure - def test_softmax_tosa_u55_BI_xfails( - self, - test_name: str, - test_data: torch.Tensor, - dim: int, - ): - self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) - @parameterized.expand(test_data_suite) def test_softmax_tosa_u85_BI( self, diff --git a/backends/arm/test/ops/test_squeeze.py b/backends/arm/test/ops/test_squeeze.py index ac26fd73fa..9f02392e1e 100644 --- a/backends/arm/test/ops/test_squeeze.py +++ b/backends/arm/test/ops/test_squeeze.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -136,7 +136,7 @@ def test_squeeze_u55_BI( test_tensor: torch.Tensor, ): self._test_squeeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=False), + common.get_u55_compile_spec(), self.Squeeze(), (test_tensor,), "torch.ops.aten.squeeze.default", @@ -148,7 +148,7 @@ def test_squeeze_u85_BI( test_tensor: torch.Tensor, ): self._test_squeeze_ethosu_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), self.Squeeze(), (test_tensor,), "torch.ops.aten.squeeze.default", @@ -169,7 +169,7 @@ def test_squeeze_dim_tosa_BI(self, test_tensor: torch.Tensor, dim: int): @parameterized.expand(SqueezeDim.test_parameters) def test_squeeze_dim_u55_BI(self, test_tensor: torch.Tensor, dim: int): self._test_squeeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=False), + common.get_u55_compile_spec(), self.SqueezeDim(), (test_tensor, dim), "torch.ops.aten.squeeze.dim", @@ -178,7 +178,7 @@ def test_squeeze_dim_u55_BI(self, test_tensor: torch.Tensor, dim: int): @parameterized.expand(SqueezeDim.test_parameters) def test_squeeze_dim_u85_BI(self, test_tensor: torch.Tensor, dim: int): self._test_squeeze_ethosu_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), self.SqueezeDim(), (test_tensor, dim), "torch.ops.aten.squeeze.dim", @@ -199,7 +199,7 @@ def test_squeeze_dims_tosa_BI(self, test_tensor: torch.Tensor, dims: tuple[int]) @parameterized.expand(SqueezeDims.test_parameters) def test_squeeze_dims_u55_BI(self, test_tensor: torch.Tensor, dims: tuple[int]): self._test_squeeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=False), + common.get_u55_compile_spec(), self.SqueezeDims(), (test_tensor, dims), "torch.ops.aten.squeeze.dims", diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 098e0fd1bc..7f85cba4c3 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -39,9 +39,6 @@ class Sum(torch.nn.Module): ((torch.rand(10), 0, True),), ((torch.rand(10, 10), 1, False),), ((torch.rand(1, 2, 3, 4), 3, True),), - ] - - test_parameters_u55_xfails: list[Tuple[exampledata_t]] = [ ((torch.rand(10, 10, 10), [-3, 1], True),), ((torch.rand(2, 1, 5, 8), 1, False),), ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), @@ -129,17 +126,7 @@ def test_sum_u55_BI(self, test_data: tuple[exampledata_t]): self._test_sum_ethosu_BI_pipeline( self.Sum(), test_data, - common.get_u55_compile_spec(permute_memory_to_nhwc=False), - ) - - # Expected to fail as this is not supported on u55. 
- @parameterized.expand(Sum.test_parameters_u55_xfails) - @unittest.expectedFailure - def test_sum_u55_BI_xfails(self, test_data: tuple[exampledata_t]): - self._test_sum_ethosu_BI_pipeline( - self.Sum(), - test_data, - common.get_u55_compile_spec(permute_memory_to_nhwc=False), + common.get_u55_compile_spec(), ) @parameterized.expand(Sum.test_parameters) @@ -147,5 +134,5 @@ def test_sum_u85_BI(self, test_data: tuple[exampledata_t]): self._test_sum_ethosu_BI_pipeline( self.Sum(), test_data, - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), ) diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py index a6faf70af0..68f4fe4612 100644 --- a/backends/arm/test/ops/test_unsqueeze.py +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -95,7 +95,7 @@ def test_unsqueeze_tosa_MI(self, test_tensor: torch.Tensor): def test_unsqueeze_tosa_BI(self, test_tensor: torch.Tensor): self._test_unsqueeze_tosa_BI_pipeline(self.Unsqueeze(), (test_tensor, 0)) - @parameterized.expand(Unsqueeze.test_parameters[:-1]) + @parameterized.expand(Unsqueeze.test_parameters) def test_unsqueeze_u55_BI(self, test_tensor: torch.Tensor): self._test_unsqueeze_ethosu_BI_pipeline( common.get_u55_compile_spec(), diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index 322ac5b0ed..e1fed05817 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -158,10 +158,8 @@ def test_var_tosa_MI(self, test_tensor: torch.Tensor, keepdim, correction): def test_var_tosa_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_tosa_BI_pipeline(self.Var(), (test_tensor, keepdim, correction)) - # Expected to fail as this is not supported on u55. @parameterized.expand(Var.test_parameters) - @unittest.expectedFailure - def test_var_u55_BI_xfails(self, test_tensor: torch.Tensor, keepdim, correction): + def test_var_u55_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_ethosu_BI_pipeline( self.Var(), common.get_u55_compile_spec(), @@ -196,18 +194,6 @@ def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, correctio (test_tensor, dim, keepdim, correction), ) - # Expected to fail as this is not supported on u55. - @parameterized.expand(VarDim.test_parameters_u55_xfails) - @unittest.expectedFailure - def test_var_dim_u55_BI_xfails( - self, test_tensor: torch.Tensor, dim, keepdim, correction - ): - self._test_var_ethosu_BI_pipeline( - self.VarDim(), - common.get_u55_compile_spec(), - (test_tensor, dim, keepdim, correction), - ) - @parameterized.expand(VarDim.test_parameters) def test_var_dim_u85_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): self._test_var_ethosu_BI_pipeline( @@ -232,10 +218,8 @@ def test_var_correction_tosa_BI( self.VarCorrection(), (test_tensor, dim, keepdim, correction) ) - # Expected to fail as this is not supported on u55. 
@parameterized.expand(VarCorrection.test_parameters) - @unittest.expectedFailure - def test_var_correction_u55_BI_xfails( + def test_var_correction_u55_BI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index 1603a2a37d..f90ae40206 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -129,15 +129,10 @@ def test_view_tosa_MI(self, test_tensor: torch.Tensor, new_shape): def test_view_tosa_BI(self, test_tensor: torch.Tensor, new_shape): self._test_view_tosa_BI_pipeline(self.View(), (test_tensor, new_shape)) - @parameterized.expand(View.no_transpose_tests) + @parameterized.expand(View.needs_transpose_tests + View.no_transpose_tests) def test_view_u55_BI(self, test_tensor: torch.Tensor, new_shape): self._test_view_u55_BI_pipeline(self.View(), (test_tensor, new_shape)) - @parameterized.expand(View.needs_transpose_tests) - @unittest.expectedFailure - def test_view_transpose_u55_BI(self, test_tensor: torch.Tensor, new_shape): - self._test_view_u55_BI_pipeline(self.View(), (test_tensor, new_shape)) - @parameterized.expand(View.needs_transpose_tests + View.no_transpose_tests) def test_view_u85_BI(self, test_tensor: torch.Tensor, new_shape): self._test_view_u85_BI_pipeline(self.View(), (test_tensor, new_shape)) diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index b3f5b4f05b..2b65c306be 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -15,7 +15,7 @@ import torch.fx -from executorch.backends.arm.arm_backend import get_intermediate_path, is_permute_memory +from executorch.backends.arm.arm_backend import get_intermediate_path from executorch.backends.arm.arm_partitioner import ArmPartitioner from executorch.backends.arm.quantizer.arm_quantizer import ( ArmQuantizer, @@ -329,7 +329,6 @@ def run_method_and_compare_outputs( logger.info( f"Comparing Stage '{self.stage_name(test_stage)}' with Stage '{self.stage_name(reference_stage)}'" ) - is_nhwc = is_permute_memory(self.compile_spec) # Loop inputs and compare reference stage with the compared stage. 
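With the compile-spec flag gone, the tester always shuffles data between NCHW and NHWC around the ToExecutorch stage instead of consulting is_permute_memory. Roughly what transpose_data_format does for rank-4 tensors (a simplified sketch, not the actual helper, which operates on lists of tensors):

    import torch

    def transpose_data_format(t: torch.Tensor, order: str) -> torch.Tensor:
        # Only rank-4 tensors have a channel axis to relocate.
        if t.dim() != 4:
            return t
        perm = (0, 2, 3, 1) if order == "NHWC" else (0, 3, 1, 2)  # else "NCHW"
        return t.permute(*perm).contiguous()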
for run_iteration in range(num_runs): @@ -344,10 +343,7 @@ def run_method_and_compare_outputs( if isinstance(arg, tuple) and isinstance(arg[0], torch.Tensor): test_input.extend([tensor.clone() for tensor in arg]) - if ( - is_nhwc - and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] - ): + if test_stage == self.stages[self.stage_name(tester.ToExecutorch)]: test_input = self.transpose_data_format(test_input, "NHWC") input_shapes = [ @@ -359,10 +355,7 @@ def run_method_and_compare_outputs( reference_output = reference_stage.run_artifact(reference_input) test_output = test_stage.run_artifact(test_input) - if ( - is_nhwc - and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] - ): + if test_stage == self.stages[self.stage_name(tester.ToExecutorch)]: test_output = self.transpose_data_format(test_output, "NCHW") self._compare_outputs( diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 6c71909c47..e2ac3de5ca 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -60,9 +60,6 @@ if(EXECUTORCH_CADENCE_CPU_RUNNER) ${_common_include_directories} ) - set(TARGET_DIR reference) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) - target_link_libraries( cadence_runner executorch @@ -78,12 +75,12 @@ endif() if(EXECUTORCH_NNLIB_OPT) set(TARGET_DIR hifi) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) -endif() - -if(EXECUTORCH_FUSION_G3_OPT) +elseif(EXECUTORCH_FUSION_G3_OPT) set(TARGET_DIR fusion_g3) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) +else() + set(TARGET_DIR reference) endif() +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators) diff --git a/backends/cadence/fusion_g3/operators/op_mean.cpp b/backends/cadence/fusion_g3/operators/op_mean.cpp index be866b2f51..289baceb12 100644 --- a/backends/cadence/fusion_g3/operators/op_mean.cpp +++ b/backends/cadence/fusion_g3/operators/op_mean.cpp @@ -59,7 +59,7 @@ int prepare_data( return num_axis_dims; } -Tensor& mean_dim_out( +Tensor& mean_out( KernelRuntimeContext& ctx, const Tensor& in, optional<ArrayRef<int64_t>> dim_list, @@ -199,4 +199,4 @@ Tensor& mean_dim_out( } // namespace native } // namespace G3 } // namespace impl -} // namespace cadence \ No newline at end of file +} // namespace cadence diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 3d5a432431..699e064669 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -51,6 +51,11 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary( } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { num_graphs = binaryinfo->contextBinaryInfoV2.numGraphs; graphs = binaryinfo->contextBinaryInfoV2.graphs; +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) { + num_graphs = binaryinfo->contextBinaryInfoV3.numGraphs; + graphs = binaryinfo->contextBinaryInfoV3.graphs; +#endif } else { QNN_EXECUTORCH_LOG_WARN( "Unknown QNN BinaryInfo version %d.", binaryinfo->version); @@ -62,6 +67,10 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary( RetrieveGraphInfo(graphs[i].graphInfoV1); } else if (graphs->version ==
QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) { RetrieveGraphInfo(graphs[i].graphInfoV2); +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + } else if (graphs->version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) { + RetrieveGraphInfo(graphs[i].graphInfoV3); +#endif } else { QNN_EXECUTORCH_LOG_WARN( "Unknown QNN GraphInfo version %d.", binaryinfo->version); diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp index 757034baa8..030b5666da 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp @@ -17,6 +17,9 @@ using executorch::runtime::Error; Error HtpBackendCache::RetrieveBackendBinaryInfo( const QnnSystemContext_BinaryInfo_t* binaryinfo) { QnnHtpSystemContext_HwBlobInfo_t* htp_hwblobinfo = nullptr; +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + QnnHtpSystemContext_GraphBlobInfo_t* htp_graphblobinfo = nullptr; +#endif if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { htp_hwblobinfo = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>( @@ -24,27 +27,43 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo( } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { htp_hwblobinfo = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>( binaryinfo->contextBinaryInfoV2.hwInfoBlob); +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) { + htp_graphblobinfo = static_cast<QnnHtpSystemContext_GraphBlobInfo_t*>( + binaryinfo->contextBinaryInfoV3.graphs->graphInfoV3.graphBlobInfo); +#endif } else { QNN_EXECUTORCH_LOG_WARN( "Unknown QNN BinaryInfo version %d.", binaryinfo->version); return Error::Internal; } - if (htp_hwblobinfo == nullptr) { - QNN_EXECUTORCH_LOG_WARN( - "Htp hardware blob information is not found in binary information."); - return Error::Ok; + if (htp_hwblobinfo) { + if (htp_hwblobinfo->version == + QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1) { + spill_fill_buf_ = + (*htp_hwblobinfo).contextBinaryHwInfoBlobV1_t.spillFillBufferSize; + } else { + QNN_EXECUTORCH_LOG_WARN( + "Unknown QNN Htp hw blob info version %d.", htp_hwblobinfo->version); + return Error::Internal; + } } - if (htp_hwblobinfo->version == - QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1) { - spill_fill_buf_ = - (*htp_hwblobinfo).contextBinaryHwInfoBlobV1_t.spillFillBufferSize; - } else { - QNN_EXECUTORCH_LOG_WARN( - "Unknown QNN Htp hw blob info version %d.", htp_hwblobinfo->version); - return Error::Internal; +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + if (htp_graphblobinfo) { + if (htp_graphblobinfo->version == + QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1) { + spill_fill_buf_ = + (*htp_graphblobinfo).contextBinaryGraphBlobInfoV1.spillFillBufferSize; + } else { + QNN_EXECUTORCH_LOG_WARN( + "Unknown QNN Htp graph blob info version %d.", + htp_graphblobinfo->version); + return Error::Internal; + } } +#endif return Error::Ok; } diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 900854ccd7..8c76c11532 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -478,13 +478,6 @@ vTensor::vTensor( if (storage_type != utils::kBuffer) { set_logical_limits(storage_.image_extents_); } - - if (dtype == vkapi::kHalf) { - VK_CHECK_COND( - api::context()->adapter_ptr()->supports_16bit_storage_buffers(), - "Half dtype is only available if the
physical device supports float16 " - "storage buffers!"); - } } // NOLINTNEXTLINE diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index e27723468a..bf501296b1 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -68,6 +68,8 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); + context->check_device_capabilities(shader_); + vTensorPtr packed = graph->get_tensor(packed_); api::StagingBuffer staging = create_staging_buffer(graph); diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index ad4ff245a1..cd385718ce 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -14,6 +14,8 @@ #define TILE_SIZE ${TILE_SIZE} +#define STRIDE_EQ_DILATION ${STRIDE_EQ_DILATION} + #define BATCH_SIZE_X ${BATCH_SIZE_X} #define BATCH_SIZE_Y ${BATCH_SIZE_Y} @@ -40,6 +42,8 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; * Computes a depthwise convolution. Each shader invocation calculates the * output at a single output location. */ + +#if STRIDE_EQ_DILATION void main() { // x and y are divided by batch size to determine 3d position // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z @@ -121,3 +125,42 @@ void main() { } } } + +#else +void main() { + const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x; + const ivec3 pos = ivec3( + gl_GlobalInvocationID.x % out_limits.x, + div_by_x % out_limits.y, + div_by_x / out_limits.y); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + // Compute the index of the top-left element of the overlay region. Negative + // indices indicate that the top-left element is in a region added by padding. + const ivec2 ipos = pos.xy * stride - padding; + + // Compute the start and end of the input indices to load. Padding is assumed + // to be constant 0 padding, so any reads from the padding region is skipped. + const ivec2 start = ipos; + const ivec2 end = ipos + overlay_region.xy; + + VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); + int kx = 0; + for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) { + for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) { + // The weight kernel was rearranged such that every NxN filter is + // flattened to fit in one row. Each filter was then stacked on top of + // each other vertically. 
+ const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0); + sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum); + kx++; + } + } + + imageStore(t_out, pos, op(sum, out_min, out_max)); +} + +#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml index 9cf6c22c6c..d3672f5ec2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml @@ -12,6 +12,7 @@ conv2d_dw_output_tile: TILE_SIZE: 3 BATCH_SIZE_X: 4 BATCH_SIZE_Y: 2 + STRIDE_EQ_DILATION: 0 generate_variant_forall: DTYPE: - VALUE: half @@ -25,3 +26,15 @@ conv2d_dw_output_tile: - NAME: conv2d_dw_output_tile_5x5_clamp OPERATOR: clamp(X, A, B) TILE_SIZE: 5 + - NAME: conv2d_dw_sed_output_tile_3x3 + STRIDE_EQ_DILATION: 1 + - NAME: conv2d_dw_sed_output_tile_3x3_clamp + OPERATOR: clamp(X, A, B) + STRIDE_EQ_DILATION: 1 + - NAME: conv2d_dw_sed_output_tile_5x5 + TILE_SIZE: 5 + STRIDE_EQ_DILATION: 1 + - NAME: conv2d_dw_sed_output_tile_5x5_clamp + OPERATOR: clamp(X, A, B) + TILE_SIZE: 5 + STRIDE_EQ_DILATION: 1 diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 64c145fb7e..a7c11cc853 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -126,13 +126,17 @@ vkapi::ShaderInfo get_conv2d_shader( const bool prepack_weights, const Conv2dMethod method, const ValueRef weight, - const bool clamp_out = false) { + const bool clamp_out = false, + const bool stride_equals_dilation = false) { std::string kernel_name; kernel_name.reserve(kShaderNameReserve); switch (method) { case Conv2dMethod::Depthwise: kernel_name = "conv2d_dw"; if (!prepack_weights) { + if (stride_equals_dilation) { + kernel_name += "_sed"; + } const auto& weight_sizes = graph.get_tref(weight)->sizes; if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { kernel_name += "_output_tile_3x3"; @@ -286,22 +290,37 @@ Conv2dMethod get_conv2d_method( return Conv2dMethod::SlidingWindow; } +utils::uvec2 get_conv2d_dw_dispatch_divisor( + const std::vector<int64_t>& weight_sizes) { + if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { + return {4u, 2u}; + } + if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { + return {4u, 2u}; + } + return {4u, 2u}; +} + utils::uvec3 create_conv2d_global_wg_size( ComputeGraph& graph, const Conv2dMethod method, - const ValueRef out) { + const ValueRef out, + const ValueRef weight_data, + const bool stride_equals_dilation) { if (method == Conv2dMethod::Pointwise) { const utils::uvec3 image_extents = graph.logical_limits_of(out); return { utils::div_up(image_extents[0u], 2u), utils::div_up(image_extents[1u], 2u), image_extents[2u]}; - } else if (method == Conv2dMethod::Depthwise) { - const utils::uvec3 image_extents = graph.logical_limits_of(out); + } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) { + const utils::uvec3 image_extents = graph.create_global_wg_size(out); + const utils::uvec2 div = + get_conv2d_dw_dispatch_divisor(graph.get_tref(weight_data)->sizes); return { - utils::div_up(image_extents[0u], 4u), - utils::div_up(image_extents[1u], 2u), - image_extents[2u]}; + utils::div_up(image_extents[0], div[0]), + utils::div_up(image_extents[1], div[1]), + image_extents[2]}; } else { return graph.create_global_wg_size(out); } @@ -364,6 +383,10 @@ void add_conv2d_node( Conv2dParams
extra_params = create_conv2d_params(graph, weight_data, kernel_params, transposed_val); + const bool stride_equals_dilation = + (kernel_params.stride[0] == kernel_params.dilation[0] && + kernel_params.stride[1] == kernel_params.dilation[1]); + OutputParams out_params = {out_min_val, out_max_val}; check_conv2d_params(kernel_params, transposed_val); @@ -374,9 +397,11 @@ void add_conv2d_node( /*prepack_weights = */ false, method, weight_data, - clamp_out); + clamp_out, + stride_equals_dilation); - utils::uvec3 wg_size = create_conv2d_global_wg_size(graph, method, out); + utils::uvec3 wg_size = create_conv2d_global_wg_size( + graph, method, out, weight_data, stride_equals_dilation); if (method == Conv2dMethod::Pointwise || method == Conv2dMethod::Depthwise) { wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1}; diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index 3a8a59166e..7976d0ddee 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -151,7 +151,8 @@ VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { // Staging buffers are accessed by both the CPU and GPU, so set the // appropriate flags to indicate that the host device will be accessing // the data from this buffer. - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | + alloc_create_info.flags |= + VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 85732d7701..d32fa71573 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -348,6 +348,39 @@ def get_conv_inputs(): [0, 0], 1, ), + ( + (1, 4, 234, 234), + (4, 1, 3, 3), + (4,), + [2, 1], + [1, 1], + [1, 1], + False, + [0, 0], + 4, + ), + ( + (1, 4, 234, 234), + (4, 1, 3, 3), + (4,), + [1, 2], + [1, 1], + [1, 1], + False, + [0, 0], + 4, + ), + ( + (1, 4, 234, 234), + (4, 1, 3, 3), + (4,), + [2, 2], + [1, 1], + [1, 1], + False, + [0, 0], + 4, + ), ] ) return test_suite diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl index ab55d5beea..d26f1a805c 100644 --- a/backends/vulkan/test/op_tests/targets.bzl +++ b/backends/vulkan/test/op_tests/targets.bzl @@ -3,6 +3,44 @@ load("@fbsource//xplat/caffe2:pt_defs.bzl", "get_pt_ops_deps") load("@fbsource//xplat/caffe2:pt_ops.bzl", "pt_operator_library") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +def define_test_targets(test_name, extra_deps = [], src_file = None, is_fbcode = False): + deps_list = [ + "//third-party/googletest:gtest_main", + "//executorch/backends/vulkan:vulkan_graph_runtime", + runtime.external_dep_location("libtorch"), + ] + extra_deps + + src_file_str = src_file if src_file else "{}.cpp".format(test_name) + + runtime.cxx_binary( + name = "{}_bin".format(test_name), + srcs = [ + src_file_str, + ], + compiler_flags = [ + "-Wno-unused-variable", + ], + define_static_target = False, + deps = deps_list, + ) + + runtime.cxx_test( + name = test_name, + srcs = [ + src_file_str, + ], + contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], + fbandroid_additional_loaded_sonames = [ + "torch-code-gen", + "vulkan_graph_runtime", + 
"vulkan_graph_runtime_shaderlib", + ], + platforms = [ANDROID], + use_instrumentation_test = True, + deps = deps_list, + ) + + def define_common_targets(is_fbcode = False): if is_fbcode: return @@ -82,19 +120,6 @@ def define_common_targets(is_fbcode = False): default_outs = ["."], ) - runtime.cxx_binary( - name = "compute_graph_op_tests_bin", - srcs = [ - ":generated_op_correctness_tests_cpp[op_tests.cpp]", - ], - define_static_target = False, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], - ) - runtime.cxx_binary( name = "compute_graph_op_benchmarks_bin", srcs = [ @@ -111,135 +136,17 @@ def define_common_targets(is_fbcode = False): ], ) - runtime.cxx_test( - name = "compute_graph_op_tests", - srcs = [ - ":generated_op_correctness_tests_cpp[op_tests.cpp]", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], + define_test_targets( + "compute_graph_op_tests", + src_file=":generated_op_correctness_tests_cpp[op_tests.cpp]" ) - runtime.cxx_binary( - name = "sdpa_test_bin", - srcs = [ - "sdpa_test.cpp", - ], - compiler_flags = [ - "-Wno-unused-variable", - ], - define_static_target = False, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", - ], - ) - - runtime.cxx_test( - name = "sdpa_test", - srcs = [ - "sdpa_test.cpp", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", - "//executorch/extension/tensor:tensor", - runtime.external_dep_location("libtorch"), - ], - ) - - runtime.cxx_binary( - name = "linear_weight_int4_test_bin", - srcs = [ - "linear_weight_int4_test.cpp", - ], - compiler_flags = [ - "-Wno-unused-variable", - ], - define_static_target = False, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], - ) - - runtime.cxx_test( - name = "linear_weight_int4_test", - srcs = [ - "linear_weight_int4_test.cpp", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", + define_test_targets( + "sdpa_test", + extra_deps = [ "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", "//executorch/extension/tensor:tensor", - runtime.external_dep_location("libtorch"), - ], - ) - - runtime.cxx_binary( - name = "rotary_embedding_test_bin", - srcs = [ - "rotary_embedding_test.cpp", - ], - compiler_flags = [ - "-Wno-unused-variable", 
- ], - define_static_target = False, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], - ) - - runtime.cxx_test( - name = "rotary_embedding_test", - srcs = [ - "rotary_embedding_test.cpp", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - "//executorch/extension/tensor:tensor", - runtime.external_dep_location("libtorch"), - ], + ] ) + define_test_targets("linear_weight_int4_test") + define_test_targets("rotary_embedding_test") diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py index 472127ffe2..6f93e66207 100644 --- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -667,7 +667,6 @@ def gen_op_check_fn(self) -> str: op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {\n" op_check_fn_body = "" - op_check_fn_body += self.gen_conditional_skips() op_check_fn_body += self.gen_graph_build_code() op_check_fn_body += self.gen_graph_exec_code() diff --git a/backends/xnnpack/operators/op_squeeze.py b/backends/xnnpack/operators/op_squeeze.py index 8ed5aa36ae..7a21fe9e55 100644 --- a/backends/xnnpack/operators/op_squeeze.py +++ b/backends/xnnpack/operators/op_squeeze.py @@ -16,7 +16,9 @@ XNNStaticReshape, XNode, ) + from executorch.backends.xnnpack.utils.utils import check_or_raise, get_input_node +from torch.fx.experimental.symbolic_shapes import free_symbols @register_node_visitor @@ -57,7 +59,7 @@ def define_node( num_dynamic_dims = 0 for dim in dynamic_shape: - if isinstance(dim, torch.SymInt): + if free_symbols(dim): num_dynamic_dims += 1 new_shape.append(0) else: @@ -119,7 +121,7 @@ def define_node( num_dynamic_dims = 0 for dim in dynamic_shape: - if isinstance(dim, torch.SymInt): + if free_symbols(dim): num_dynamic_dims += 1 new_shape.append(0) else: diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index 5e43a63c76..acfede66e6 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -59,7 +59,7 @@ This example is verified with SM8550 and SM8450. - Click the "Get Software" button to download a version of QNN SDK. - However, at the time of updating this tutorial, the above website doesn't provide a QNN SDK newer than 2.22.6. - Below are public links to download various QNN versions. We hope they become publicly discoverable soon.
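The op_squeeze.py change above swaps an isinstance check for free_symbols, which also catches dimensions that are SymInt expressions (for example s0 + 1) and treats fully specialized symbolic values as static. A small sketch of the pattern (count_dynamic_dims is an illustrative helper, not part of the patch):

    from torch.fx.experimental.symbolic_shapes import free_symbols

    def count_dynamic_dims(shape) -> int:
        # free_symbols() returns an empty set for plain ints and for
        # symbolic values with no free symbols, so its truthiness is
        # exactly "this dimension is dynamic".
        return sum(1 for dim in shape if free_symbols(dim))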
- - [QNN 2.26.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.26.0.240828.zip) + - [QNN 2.28.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.28.0.241029.zip) The directory with installed Qualcomm AI Engine Direct SDK looks like: ``` diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index 133f9ec50b..7ed768baf2 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -9,7 +9,7 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng - Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama) to know how to run a llama model on mobile via ExecuTorch. - A Qualcomm device with 16GB RAM - We are continuing to optimize our memory usage to ensure compatibility with lower memory devices. -- The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.26.0 or above. +- The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.28.0 or above. ## Instructions diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 2de1e713c9..1208d79b06 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2023-2024 Arm Limited and/or its affiliates. +# Copyright 2023-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -263,11 +263,7 @@ def get_compile_spec( ) -> ArmCompileSpecBuilder: spec_builder = None if target == "TOSA": - spec_builder = ( - ArmCompileSpecBuilder() - .tosa_compile_spec("TOSA-0.80+BI") - .set_permute_memory_format(True) - ) + spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI") elif "ethos-u55" in target: spec_builder = ( ArmCompileSpecBuilder() @@ -277,7 +273,6 @@ def get_compile_spec( memory_mode=memory_mode, extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate", ) - .set_permute_memory_format(True) .set_quantize_io(True) .set_input_order(reorder_inputs) ) @@ -290,7 +285,6 @@ def get_compile_spec( memory_mode=memory_mode, extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate", ) - .set_permute_memory_format(True) .set_quantize_io(True) .set_input_order(reorder_inputs) ) diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index bf922360fd..5498bd7897 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -2,7 +2,7 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # -# Copyright 2023-2024 Arm Limited and/or its affiliates. +# Copyright 2023-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -91,8 +91,8 @@ tosa_reference_model_url="https://review.mlplatform.org/tosa/reference_model" tosa_reference_model_rev="v0.80.1" # vela -vela_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u-vela" -vela_rev="5427dc7e9c1a4c7d554163290faeea75f168772d" +vela_repo_url="https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela" +vela_rev="fc970e3da72e5f6930b840b357684126602b3126" ######## ### Mandatory user args diff --git a/exir/backend/utils.py b/exir/backend/utils.py index 50d1e73fd7..9487c59a84 100644 --- a/exir/backend/utils.py +++ b/exir/backend/utils.py @@ -23,6 +23,7 @@ from executorch.exir.lowered_backend_module import create_submodule_from_nodes from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param +from torch.fx.experimental.symbolic_shapes import has_free_symbols from torch.fx.node import Node from torch.fx.passes.utils.source_matcher_utils import SourcePartition @@ -424,10 +425,7 @@ def is_shape_dynamic(node: torch.fx.Node) -> bool: Check if the node shape is dynamic. """ - # Shape is dynamic if any of the dimensions don't evaluate to a static value - return "val" in node.meta and any( - isinstance(d, torch.SymInt) for d in node.meta["val"].shape - ) + return has_free_symbols(node.meta["val"].shape) # TODO - style: use templated types diff --git a/extension/llm/modules/test/test_attention.py b/extension/llm/modules/test/test_attention.py index 6cd05b4bf6..3ecf0b2b4b 100644 --- a/extension/llm/modules/test/test_attention.py +++ b/extension/llm/modules/test/test_attention.py @@ -33,6 +33,7 @@ def setUp(self): self.num_kv_heads = 8 self.head_dim = 64 self.max_seq_len = 128 + self.encoder_max_seq_len = 128 self.rope_base = 500_000 self.scale_factor = 32 @@ -86,16 +87,26 @@ def setUp(self): max_seq_len=self.max_seq_len, ) self.et_mha.load_state_dict(self.tt_mha.state_dict()) + # Common inputs. seq_len = 10 self.x = torch.randn(1, seq_len, self.embed_dim) + self.y = torch.randn(1, seq_len, self.embed_dim) self.input_pos = torch.arange(seq_len).unsqueeze(0) # shape [1, seq_len] - seq_len_dim = torch.export.Dim("seq_len", min=1, max=100) - self.dynamic_shapes = ( - {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC}, - {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC}, - {0: torch.export.Dim.STATIC, 1: seq_len_dim}, - ) + self.seq_len_dim = torch.export.Dim("seq_len", min=1, max=self.max_seq_len) + self.dynamic_shapes = { + "x": { + 0: torch.export.Dim.STATIC, + 1: self.seq_len_dim, + 2: torch.export.Dim.STATIC, + }, + "y": { + 0: torch.export.Dim.STATIC, + 1: self.seq_len_dim, + 2: torch.export.Dim.STATIC, + }, + "input_pos": {0: torch.export.Dim.STATIC, 1: self.seq_len_dim}, + } self.causal_mask = torch.tril( torch.ones( size=(self.max_seq_len, self.max_seq_len), @@ -110,8 +121,8 @@ def test_attention_eager(self): assert_close(et_res, tt_res) # test with kv cache - self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=20) - self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=20) + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) et_res = self.et_mha(self.x, self.x) # Self attention. tt_res = self.tt_mha(self.x, self.x) # Self attention. @@ -144,12 +155,12 @@ def test_attention_export(self): # Self attention. 
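The attention tests now pass dynamic_shapes as a dict keyed by the forward() argument names instead of a positional tuple, which makes it explicit that x, y, and input_pos share one sequence-length dimension. The core pattern, condensed (argument names as in the test above; dims left unmentioned in the dict form are treated as static):

    import torch

    seq_len_dim = torch.export.Dim("seq_len", min=1, max=128)
    dynamic_shapes = {
        "x": {1: seq_len_dim},          # batch and embed dims stay static
        "y": {1: seq_len_dim},
        "input_pos": {1: seq_len_dim},
    }
    # torch.export.export(mha, (x, y), kwargs={"input_pos": pos},
    #                     dynamic_shapes=dynamic_shapes)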
# test with kv cache - self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) - self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) with torch.no_grad(): et_mha_ep = torch.export.export( self.et_mha, - (self.x, self.x), + (self.x, self.y), kwargs={"input_pos": self.input_pos}, dynamic_shapes=self.dynamic_shapes, strict=True, @@ -166,8 +177,8 @@ def test_attention_aoti(self): # Self attention. # test with kv cache - self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) - self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) with torch.no_grad(): so = torch._export.aot_compile( self.et_mha, @@ -189,13 +200,13 @@ def test_attention_aoti(self): def test_attention_executorch(self): # Self attention. - self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) - self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) with torch.no_grad(): et_mha_ep = torch.export.export( self.et_mha, - (self.x, self.x), + (self.x, self.y), kwargs={"input_pos": self.input_pos}, dynamic_shapes=self.dynamic_shapes, strict=True, @@ -222,22 +233,18 @@ def test_attention_executorch(self): def test_attention_torch_cond_eager(self): # Different from vanilla torchtune MHA, we rewrite the if condition with torch.cond. We need to make sure they are giving the same results regarding the if condition. - # For the first run of MHA we provide `y` (self.x) but for the second run it will be a tensor full of nan. + # For the first run of MHA we provide `y` but for the second run it will be a tensor full of nan. self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) mask = self.causal_mask[self.input_pos, :] # First run. - et_res = self.et_mha( - self.x, self.x, mask=mask, input_pos=self.input_pos - ) # Self attention with input pos. - tt_res = self.tt_mha( - self.x, self.x, mask=mask, input_pos=self.input_pos - ) # Self attention with input pos. + et_res = self.et_mha(self.x, self.y, mask=mask, input_pos=self.input_pos) + tt_res = self.tt_mha(self.x, self.y, mask=mask, input_pos=self.input_pos) assert_close(et_res, tt_res) - # Second run test kv cache read. Input pos is [10, 11, ..., 19] + # Second run tests kv cache read. 
Input pos is [10, 11, ..., 19] next_input_pos = torch.arange(10, 20).unsqueeze(0) empty_y = torch.full_like(self.x, torch.nan) @@ -246,3 +253,101 @@ def test_attention_torch_cond_eager(self): tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos) assert_close(et_res, tt_res) + + def test_attention_torch_cond_export(self): + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + mask = self.causal_mask[self.input_pos, :] + dynamic_shapes = { + **self.dynamic_shapes, + **{ + "mask": { + 0: torch.export.Dim.STATIC, + 1: self.seq_len_dim, + 2: torch.export.Dim.STATIC, + } + }, + } + with torch.no_grad(): + et_mha_ep = torch.export.export( + self.et_mha, + (self.x, self.y), + kwargs={ + "mask": mask, + "input_pos": self.input_pos, + }, + dynamic_shapes=dynamic_shapes, + strict=True, + ) + + # First run. + et_res = et_mha_ep.module()(self.x, self.y, mask=mask, input_pos=self.input_pos) + tt_res = self.tt_mha(self.x, self.y, mask=mask, input_pos=self.input_pos) + + assert_close(et_res, tt_res) + + # Second run tests kv cache read. Input pos is [10, 11, ..., 19] + next_input_pos = torch.arange(10, 20).unsqueeze(0) + empty_y = torch.full_like(self.y, torch.nan) + mask = self.causal_mask[next_input_pos, :] + et_res = et_mha_ep.module()( + self.x, empty_y, mask=mask, input_pos=next_input_pos + ) + tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos) + + assert_close(et_res, tt_res) + + def test_attention_torch_cond_executorch(self): + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + mask = self.causal_mask[self.input_pos, :] + dynamic_shapes = { + **self.dynamic_shapes, + **{ + "mask": { + 0: torch.export.Dim.STATIC, + 1: self.seq_len_dim, + 2: torch.export.Dim.STATIC, + } + }, + } + with torch.no_grad(): + et_mha_ep = torch.export.export( + self.et_mha, + (self.x, self.y), + kwargs={ + "mask": mask, + "input_pos": self.input_pos, + }, + dynamic_shapes=dynamic_shapes, + strict=True, + ) + et_program = to_edge( + et_mha_ep, + compile_config=EdgeCompileConfig( + _core_aten_ops_exception_list=[torch.ops.aten._assert_async.msg], + _check_ir_validity=False, + ), + ).to_executorch( + config=ExecutorchBackendConfig( + passes=[InitializedMutableBufferPass(["cache_pos"])], + ) + ) + + # First run. + runtime = Runtime.get() + program = runtime.load_program(et_program.buffer) + method = program.load_method("forward") + et_res = method.execute((self.x, self.y, mask, self.input_pos)) + tt_res = self.tt_mha(self.x, self.y, mask=mask, input_pos=self.input_pos) + + assert_close(et_res[0], tt_res) + + # Second run tests kv cache read. 
Input pos is [10, 11, ..., 19] + next_input_pos = torch.arange(10, 20).unsqueeze(0) + empty_y = torch.full_like(self.y, torch.nan) + mask = self.causal_mask[next_input_pos, :] + et_res = method.execute((self.x, empty_y, mask, next_input_pos)) + tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos) + + assert_close(et_res[0], tt_res) diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 44676f2c23..833b37cfac 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -257,6 +257,8 @@ - op: mean.out +- op: mean.dtype_out + - op: min.dim_min - op: min.unary_out diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp index aeb0d7f8ca..6730404dde 100644 --- a/kernels/portable/cpu/op_mean.cpp +++ b/kernels/portable/cpu/op_mean.cpp @@ -66,6 +66,14 @@ Tensor& mean_dim_out( return out; } +Tensor& mean_dtype_out( + KernelRuntimeContext& ctx, + const Tensor& in, + optional<ScalarType> dtype, + Tensor& out) { + return mean_dim_out(ctx, in, ArrayRef<int64_t>(), false, dtype, out); +} + } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/reduce_util.cpp b/kernels/portable/cpu/util/reduce_util.cpp index 08237e07b9..884c10b813 100644 --- a/kernels/portable/cpu/util/reduce_util.cpp +++ b/kernels/portable/cpu/util/reduce_util.cpp @@ -386,6 +386,7 @@ bool check_mean_dim_args( check_reduction_args(in, dim_list, keepdim, dtype, out)); if (dtype) { + ET_LOG(Info, "dtype is %hhd", static_cast<int8_t>(dtype.value())); ET_LOG_AND_RETURN_IF_FALSE(torch::executor::isFloatingType(dtype.value())); ET_LOG_AND_RETURN_IF_FALSE(out.scalar_type() == dtype.value()); } else { diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 96382eb497..3221b8fe34 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -577,6 +577,11 @@ - arg_meta: null kernel_name: torch::executor::mean_dim_out +- op: mean.dtype_out + kernels: + - arg_meta: null + kernel_name: torch::executor::mean_dtype_out + - op: min.dim_min kernels: - arg_meta: null diff --git a/kernels/test/op_mean_test.cpp b/kernels/test/op_mean_test.cpp index 9821cb6b47..c5ba00b20e 100644 --- a/kernels/test/op_mean_test.cpp +++ b/kernels/test/op_mean_test.cpp @@ -9,7 +9,7 @@ #include // Declares the operator #include #include -#include +#include #include #include #include @@ -22,6 +22,7 @@ using exec_aten::ArrayRef; using exec_aten::optional; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::Error; using torch::executor::testing::TensorFactory; class OpMeanOutTest : public OperatorTest { @@ -36,6 +37,13 @@ class OpMeanOutTest : public OperatorTest { context_, self, dim, keepdim, dtype, out); } + Tensor& op_mean_dtype_out( + const Tensor& self, + optional<ScalarType> dtype, + Tensor& out) { + return torch::executor::aten::mean_outf(context_, self, dtype, out); + } + template <ScalarType IN_DTYPE> void test_mean_dim_out_invalid_dimensions() { TensorFactory<IN_DTYPE> tf_in; @@ -466,3 +474,68 @@ TEST_F(OpMeanOutTest, DynamicShapeUnbound) { op_mean_out(x, ArrayRef<int64_t>{1}, false, ScalarType::Float, out); EXPECT_TENSOR_CLOSE(out, expected_result); } + +TEST_F(OpMeanOutTest, DTypeOutFloatValid) { + TensorFactory<ScalarType::Float> tf; + + Tensor x = tf.make( + {10, 10}, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + Tensor expected_result = tf.make({}, {1.0}); + + Tensor out = tf.zeros({}); + Tensor ret = op_mean_dtype_out(x, ScalarType::Float, out); + EXPECT_TENSOR_CLOSE(out, expected_result); +} + +TEST_F(OpMeanOutTest, DTypeOutFloatToBoolInvalid) { + TensorFactory<ScalarType::Float> tf; + + Tensor x = tf.make( + {10, 10}, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + Tensor expected_result = tf.make({}, {1.0}); + + Tensor out = tf.zeros({}); + + ET_EXPECT_KERNEL_FAILURE( + context_, op_mean_dtype_out(x, ScalarType::Bool, out)); +} + +TEST_F(OpMeanOutTest, DTypeOutFloatInfinity) { + TensorFactory<ScalarType::Float> tf; + + Tensor x = tf.make({2, 1}, {INFINITY, INFINITY}); + Tensor expected_result = tf.make({}, {INFINITY}); + + Tensor out = tf.zeros({}); + + Tensor ret = op_mean_dtype_out(x, ScalarType::Float, out); + EXPECT_TENSOR_CLOSE(out, expected_result); +} + +TEST_F(OpMeanOutTest, DTypeOutFloatNAN) { + TensorFactory<ScalarType::Float> tf; + + Tensor x = tf.make({2, 1}, {NAN, INFINITY}); + Tensor expected_result = tf.make({}, {NAN}); + + Tensor out = tf.zeros({}); + + Tensor ret = op_mean_dtype_out(x, ScalarType::Float, out); + EXPECT_TENSOR_CLOSE(out, expected_result); +} diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index db9417dd88..674af6d69f 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -313,6 +313,8 @@ Error Method::parse_values() { "Null value at index %zu", i); + const auto val = serialization_value->val(); + switch (serialization_value->val_type()) { case executorch_flatbuffer::KernelTypes::Null: { // Placement new as the list elements are not initialized, so calling @@ -321,18 +323,21 @@ Error Method::parse_values() { new (&values_[i]) EValue(); } break; case executorch_flatbuffer::KernelTypes::Int: { - new (&values_[i]) EValue(serialization_value->val_as_Int()->int_val()); + new (&values_[i]) EValue( + static_cast<const executorch_flatbuffer::Int*>(val)->int_val()); } break; case executorch_flatbuffer::KernelTypes::Double: { new (&values_[i]) - EValue(serialization_value->val_as_Double()->double_val()); + EValue(static_cast<const executorch_flatbuffer::Double*>(val) + ->double_val()); } break; case executorch_flatbuffer::KernelTypes::Bool: { - new (&values_[i]) - EValue(serialization_value->val_as_Bool()->bool_val()); + new (&values_[i]) EValue( + static_cast<const executorch_flatbuffer::Bool*>(val)->bool_val()); } break; case executorch_flatbuffer::KernelTypes::IntList: { - const auto items = serialization_value->val_as_IntList()->items(); + const auto items = + static_cast<const executorch_flatbuffer::IntList*>(val)->items(); ET_CHECK_OR_RETURN_ERROR( items != nullptr, InvalidProgram, "Missing list at index %zu", i); // Allocate space for boxed and unboxed list representations using @@ -352,7 +357,8 @@ Error Method::parse_values() { BoxedEvalueList<int64_t>(evalp_list, int_list, items->size())); } break; case executorch_flatbuffer::KernelTypes::BoolList: { - const auto items = serialization_value->val_as_BoolList()->items(); + const auto
items = + static_cast<const executorch_flatbuffer::BoolList*>(val)->items(); ET_CHECK_OR_RETURN_ERROR( items != nullptr, InvalidProgram, "Missing list at index %zu", i); // NOTE: This is technically not portable. A platform could technically @@ -366,14 +372,17 @@ Error Method::parse_values() { (const bool*)items->data(), items->size())); } break; case executorch_flatbuffer::KernelTypes::DoubleList: { - const auto items = serialization_value->val_as_DoubleList()->items(); + const auto items = + static_cast<const executorch_flatbuffer::DoubleList*>(val)->items(); ET_CHECK_OR_RETURN_ERROR( items != nullptr, InvalidProgram, "Missing list at index %zu", i); new (&values_[i]) EValue(exec_aten::ArrayRef<double>(items->data(), items->size())); } break; case executorch_flatbuffer::KernelTypes::String: { - const auto fb_str = serialization_value->val_as_String()->string_val(); + const auto fb_str = + static_cast<const executorch_flatbuffer::String*>(val) + ->string_val(); ET_CHECK_OR_RETURN_ERROR( fb_str != nullptr, InvalidProgram, @@ -383,7 +392,9 @@ Error Method::parse_values() { } break; case executorch_flatbuffer::KernelTypes::Tensor: { auto t = deserialization::parseTensor( - program_, memory_manager_, serialization_value->val_as_Tensor()); + program_, + memory_manager_, + static_cast<const executorch_flatbuffer::Tensor*>(val)); if (!t.ok()) { ET_LOG( Error, @@ -398,7 +409,7 @@ Error Method::parse_values() { // get list of serialization tensors and allocate storage for executor // tensors auto tensors = deserialization::parseTensorList( - serialization_value->val_as_TensorList()->items(), + static_cast<const executorch_flatbuffer::TensorList*>(val)->items(), values_, memory_manager_); if (!tensors.ok()) { @@ -415,7 +426,9 @@ Error Method::parse_values() { // Same as TensorList but optional instead of Tensor auto tensors = deserialization::parseListOptionalType( - serialization_value->val_as_OptionalTensorList()->items(), + static_cast<const executorch_flatbuffer::OptionalTensorList*>( + val) + ->items(), values_, memory_manager_); if (!tensors.ok()) { diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp index f43398d0ab..5be486b4d8 100644 --- a/runtime/executor/method_meta.cpp +++ b/runtime/executor/method_meta.cpp @@ -116,6 +116,14 @@ Result<Tag> MethodMeta::input_tag(size_t index) const { index, num_inputs); auto input_index = s_plan_->inputs()->Get(index); + size_t num_values = s_plan_->values()->size(); + ET_CHECK_OR_RETURN_ERROR( + input_index >= 0 && input_index < num_values, + InvalidProgram, + "internal value index %d out of range [0,%zu) for input %zu", + input_index, + num_values, + index); auto serialization_value = s_plan_->values()->Get(input_index); return get_tag(serialization_value, index); } @@ -132,6 +140,7 @@ Result<TensorInfo> MethodMeta::input_tensor_meta(size_t index) const { (size_t)tag.get(), index); auto input_index = s_plan_->inputs()->Get(index); + // input_index was already validated by input_tag(). auto tensor_value = s_plan_->values()->Get(input_index)->val_as_Tensor(); return TensorInfo( Span<const int32_t>( @@ -156,8 +165,16 @@ Result<Tag> MethodMeta::output_tag(size_t index) const { "index %zu out of range.
num_outputs: %zu", index, num_outputs); - auto input_index = s_plan_->outputs()->Get(index); - auto serialization_value = s_plan_->values()->Get(input_index); + auto output_index = s_plan_->outputs()->Get(index); + size_t num_values = s_plan_->values()->size(); + ET_CHECK_OR_RETURN_ERROR( + output_index >= 0 && output_index < num_values, + InvalidProgram, + "internal value index %d out of range [0,%zu) for output %zu", + output_index, + num_values, + index); + auto serialization_value = s_plan_->values()->Get(output_index); return get_tag(serialization_value, index); } @@ -173,6 +190,7 @@ Result MethodMeta::output_tensor_meta(size_t index) const { (size_t)tag.get(), index); auto output_index = s_plan_->outputs()->Get(index); + // output_index was already validated by output_tag(). auto tensor_value = s_plan_->values()->Get(output_index)->val_as_Tensor(); return TensorInfo( diff --git a/shim/xplat/executorch/backends/qualcomm/qnn_version.bzl b/shim/xplat/executorch/backends/qualcomm/qnn_version.bzl index 75019982af..5cb801489e 100644 --- a/shim/xplat/executorch/backends/qualcomm/qnn_version.bzl +++ b/shim/xplat/executorch/backends/qualcomm/qnn_version.bzl @@ -1,2 +1,2 @@ def get_qnn_library_verision(): - return "2.26" + return "2.28" diff --git a/shim/xplat/executorch/codegen/codegen.bzl b/shim/xplat/executorch/codegen/codegen.bzl index 46cdaebcb3..8e0e89eda5 100644 --- a/shim/xplat/executorch/codegen/codegen.bzl +++ b/shim/xplat/executorch/codegen/codegen.bzl @@ -397,11 +397,11 @@ def build_portable_lib(name, oplist_header_name, feature = None, expose_operator # Currently fbcode links all dependent libraries through shared # library, and it blocks users like unit tests to use kernel # implementation directly. So we enable this for xplat only. - compiler_flags = ["-Wno-missing-prototypes", "-fvisibility=hidden"] - if expose_operator_symbols: + compiler_flags = ["-Wno-missing-prototypes"] + if not expose_operator_symbols: # Removing '-fvisibility=hidden' exposes operator symbols. # This allows operators to be called outside of the kernel registry. - compiler_flags = ["-Wno-missing-prototypes"] + compiler_flags += ["-fvisibility=hidden"] # Build portable lib. runtime.cxx_library( diff --git a/test/size_test.cpp b/test/size_test.cpp index 88b605c3bf..1fab1e914e 100644 --- a/test/size_test.cpp +++ b/test/size_test.cpp @@ -94,7 +94,7 @@ int main(int argc, char** argv) { // It assumes the outputs are all tensors. for (size_t i = 0; i < method->outputs_size(); i++) { auto output_tensor = output_list[i].toTensor(); - auto data_output = output_tensor.const_data_ptr(); + [[maybe_unused]] auto data_output = output_tensor.const_data_ptr(); for (size_t j = 0; j < output_list[i].toTensor().numel(); ++j) { ET_LOG(Info, "%f", data_output[j]); }