From 8a5f52b9e1ed082dc21aaf1d6b5f9c2646620add Mon Sep 17 00:00:00 2001
From: Daniil Lyakhov <daniil.lyakhov@intel.com>
Date: Tue, 14 Jan 2025 15:44:39 -0800
Subject: [PATCH 01/40] Typo in coreml README.md (#7586)

* Typo in coreml README.md

* int8 -> qint8, uint8 -> quint8
---
 backends/apple/coreml/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backends/apple/coreml/README.md b/backends/apple/coreml/README.md
index b3b22ed999..e8a062774d 100644
--- a/backends/apple/coreml/README.md
+++ b/backends/apple/coreml/README.md
@@ -93,14 +93,14 @@ class Model(torch.nn.Module):
 source_model = Model()
 example_inputs = (torch.randn((1, 3, 256, 256)), )
 
-pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
+pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module()
 
 quantization_config = LinearQuantizerConfig.from_dict(
     {
         "global_config": {
             "quantization_scheme": QuantizationScheme.symmetric,
-            "activation_dtype": torch.uint8,
-            "weight_dtype": torch.int8,
+            "activation_dtype": torch.quint8,
+            "weight_dtype": torch.qint8,
             "weight_per_channel": True,
         }
     }

From 7fa4b87adabe203d383ff4a9208ef2f94f47d676 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Tue, 14 Jan 2025 18:49:50 -0600
Subject: [PATCH 02/40] [executorch][flat_tensor] Serialize flat tensor (#7641)

Pull Request resolved: https://github.com/pytorch/executorch/pull/7268

Serialize a flat tensor file. The resulting file looks like:

Header containing:
- flatbuffer offset and size
- segment data offset and size

Flatbuffer containing:
- Items described in [flat_tensor.fbs](https://www.internalfb.com/code/fbsource/[079ba95593be856a16783bd3f3b3579580595fbb]/fbcode/executorch/extension/flat_tensor/flat_tensor.fbs)

Tensor data (in segment)
- Raw tensor data


ghstack-source-id: 261273078
@exported-using-ghexport

Differential Revision: [D66374253](https://our.internmc.facebook.com/intern/diff/D66374253/)

Co-authored-by: lucylq <lfq@meta.com>
---
 exir/_serialize/TARGETS                       |   1 +
 extension/flat_tensor/__init__.py             |   0
 extension/flat_tensor/serialize/TARGETS       |  18 ++
 extension/flat_tensor/serialize/__init__.py   |   0
 .../serialize/flat_tensor_schema.py           |   2 +-
 extension/flat_tensor/serialize/serialize.py  | 290 ++++++++++++++++++
 extension/flat_tensor/test/TARGETS            |  14 +
 extension/flat_tensor/test/test_serialize.py  |  85 +++++
 8 files changed, 409 insertions(+), 1 deletion(-)
 create mode 100644 extension/flat_tensor/__init__.py
 create mode 100644 extension/flat_tensor/serialize/__init__.py
 create mode 100644 extension/flat_tensor/serialize/serialize.py
 create mode 100644 extension/flat_tensor/test/TARGETS
 create mode 100644 extension/flat_tensor/test/test_serialize.py

diff --git a/exir/_serialize/TARGETS b/exir/_serialize/TARGETS
index 4ce219d950..cd6a4bc5a2 100644
--- a/exir/_serialize/TARGETS
+++ b/exir/_serialize/TARGETS
@@ -33,6 +33,7 @@ runtime.python_library(
         "_dataclass.py",
         "_flatbuffer.py",
         "_program.py",
+        "data_serializer.py",
         "padding.py",
     ],
     resources = {
diff --git a/extension/flat_tensor/__init__.py b/extension/flat_tensor/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/extension/flat_tensor/serialize/TARGETS b/extension/flat_tensor/serialize/TARGETS
index c3acdca054..229f6930f4 100644
--- a/extension/flat_tensor/serialize/TARGETS
+++ b/extension/flat_tensor/serialize/TARGETS
@@ -14,3 +14,21 @@ runtime.python_library(
         "//executorch/...",
     ],
 )
+
+runtime.python_library(
+    name = "serialize",
+    srcs = [
+        "serialize.py",
+    ],
+    resources = [
+        "flat_tensor.fbs",
+        "scalar_type.fbs",
+    ],
+    visibility = [
+        "//executorch/...",
+    ],
+    deps = [
+        ":schema",
+        "//executorch/exir/_serialize:lib",
+    ],
+)
diff --git a/extension/flat_tensor/serialize/__init__.py b/extension/flat_tensor/serialize/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/extension/flat_tensor/serialize/flat_tensor_schema.py b/extension/flat_tensor/serialize/flat_tensor_schema.py
index 091ce1178b..818963d05b 100644
--- a/extension/flat_tensor/serialize/flat_tensor_schema.py
+++ b/extension/flat_tensor/serialize/flat_tensor_schema.py
@@ -18,7 +18,7 @@ class TensorMetadata:
     fully_qualified_name: str
     scalar_type: ScalarType
     sizes: List[int]
-    dim_order: List[bytes]
+    dim_order: List[int]
 
     segment_index: int
     offset: int
diff --git a/extension/flat_tensor/serialize/serialize.py b/extension/flat_tensor/serialize/serialize.py
new file mode 100644
index 0000000000..9e3df6aafc
--- /dev/null
+++ b/extension/flat_tensor/serialize/serialize.py
@@ -0,0 +1,290 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import json
+import os
+import tempfile
+from dataclasses import dataclass
+from typing import ClassVar, Dict, List, Literal, Optional
+
+import pkg_resources
+from executorch.exir._serialize._cord import Cord
+from executorch.exir._serialize._dataclass import _DataclassEncoder
+
+from executorch.exir._serialize._flatbuffer import _flatc_compile
+from executorch.exir._serialize.data_serializer import DataPayload, DataSerializer
+
+from executorch.exir._serialize.padding import aligned_size, pad_to, padding_required
+
+# Byte order of numbers written to flat tensor headers. Always little-endian
+# regardless of the host system, since all commonly-used modern CPUs are little
+# endian.
+_HEADER_BYTEORDER: Literal["little"] = "little"
+
+from executorch.extension.flat_tensor.serialize.flat_tensor_schema import (
+    DataSegment,
+    FlatTensor,
+    TensorMetadata,
+)
+
+
+def _convert_to_flatbuffer(flat_tensor: FlatTensor) -> Cord:
+    """Serializes a FlatTensor to flatbuffer bytes via its JSON form and flatc."""
+    flat_tensor_json = json.dumps(flat_tensor, cls=_DataclassEncoder)
+    with tempfile.TemporaryDirectory() as d:
+        schema_path = os.path.join(d, "flat_tensor.fbs")
+        with open(schema_path, "wb") as schema_file:
+            schema_file.write(
+                pkg_resources.resource_string(__name__, "flat_tensor.fbs")
+            )
+        scalar_type_path = os.path.join(d, "scalar_type.fbs")
+        with open(scalar_type_path, "wb") as scalar_type_file:
+            scalar_type_file.write(
+                pkg_resources.resource_string(__name__, "scalar_type.fbs")
+            )
+        json_path = os.path.join(d, "flat_tensor.json")
+        with open(json_path, "wb") as json_file:
+            json_file.write(flat_tensor_json.encode("ascii"))
+
+        _flatc_compile(d, schema_path, json_path)
+        output_path = os.path.join(d, "flat_tensor.ptd")
+        with open(output_path, "rb") as output_file:
+            return Cord(output_file.read())
+
+
+@dataclass
+class FlatTensorConfig:
+    tensor_alignment: int = 16  # Alignment of each tensor's data in the segment.
+    segment_alignment: int = 16  # Alignment of the segment's start and total size.
+
+
+@dataclass
+class FlatTensorHeader:
+    # Class constants.
+    # The magic bytes that should be at the beginning of the header.
+    EXPECTED_MAGIC: ClassVar[bytes] = b"FH01"
+    EXPECTED_LENGTH: ClassVar[int] = (
+        # Header magic
+        4
+        # Header length
+        + 4
+        # Flatbuffer offset
+        + 8
+        # Flatbuffer data size
+        + 8
+        # Segment base offset
+        + 8
+        # Segment data size
+        + 8
+    )
+
+    # Instance attributes. @dataclass will turn these into ctor args.
+
+    # Offset to the start of the flatbuffer data, in bytes.
+    flatbuffer_offset: int
+    # The size of the serialized flatbuffer data, in bytes.
+    flatbuffer_size: int
+    # Offset to the start of the first segment, or zero if there
+    # are no segments.
+    segment_base_offset: int
+    # Size of all the segment data, in bytes.
+    segment_data_size: int
+
+    # The magic bytes read from or to be written to the binary header.
+    magic: bytes = EXPECTED_MAGIC
+    # The header length, in bytes, read from or to be written to the binary
+    # header.
+    length: int = EXPECTED_LENGTH
+
+    @staticmethod
+    def from_bytes(data: bytes) -> "FlatTensorHeader":
+        """Tries to read a flat_tensor header from the provided data.
+
+        Does not validate that the header is well-formed. Callers should
+        use is_valid().
+
+        Args:
+            data: The data to read from.
+        Returns:
+            The contents of the flat_tensor header.
+        Raises:
+            ValueError: If not enough data is provided.
+        """
+        if len(data) < FlatTensorHeader.EXPECTED_LENGTH:
+            raise ValueError(
+                f"Not enough data for flat_tensor header: {len(data)} "
+                + f"< {FlatTensorHeader.EXPECTED_LENGTH}"
+            )
+
+        return FlatTensorHeader(
+            magic=data[0:4],
+            length=int.from_bytes(data[4:8], byteorder=_HEADER_BYTEORDER),
+            flatbuffer_offset=int.from_bytes(data[8:16], byteorder=_HEADER_BYTEORDER),
+            flatbuffer_size=int.from_bytes(data[16:24], byteorder=_HEADER_BYTEORDER),
+            segment_base_offset=int.from_bytes(
+                data[24:32], byteorder=_HEADER_BYTEORDER
+            ),
+            segment_data_size=int.from_bytes(data[32:40], byteorder=_HEADER_BYTEORDER),
+        )
+
+    def is_valid(self) -> bool:
+        """Returns true if the flat_tensor header appears to be well-formed."""
+        return (
+            self.magic == FlatTensorHeader.EXPECTED_MAGIC
+            and self.length >= FlatTensorHeader.EXPECTED_LENGTH
+        )
+
+    def to_bytes(self) -> bytes:
+        """Returns the binary representation of the flat_tensor header.
+
+        Note that this will ignore self.magic and self.length and will always
+        write the proper magic/length.
+        """
+        data: bytes = (
+            # Extended header magic. This lets consumers detect whether the
+            # header was inserted or not. Always use the proper magic value
+            # (i.e., ignore self.magic) since there's no reason to create an
+            # invalid header.
+            self.EXPECTED_MAGIC
+            # uint32_t: Size of this header. This makes it easier to add new
+            # fields to this header in the future. Always use the proper size
+            # (i.e., ignore self.length) since there's no reason to create an
+            # invalid header.
+            + self.EXPECTED_LENGTH.to_bytes(4, byteorder=_HEADER_BYTEORDER)
+            # uint64_t: Offset to the start of the flatbuffer data, in bytes.
+            + self.flatbuffer_offset.to_bytes(8, byteorder=_HEADER_BYTEORDER)
+            # uint64_t: Size of the serialized flatbuffer data, in bytes.
+            + self.flatbuffer_size.to_bytes(8, byteorder=_HEADER_BYTEORDER)
+            # uint64_t: Offset to the start of the first segment, or zero if
+            # there are no segments.
+            + self.segment_base_offset.to_bytes(8, byteorder=_HEADER_BYTEORDER)
+            # uint64_t: Size of all the segment data, in bytes.
+            + self.segment_data_size.to_bytes(8, byteorder=_HEADER_BYTEORDER)
+        )
+        return data
+
+
+class FlatTensorSerializer(DataSerializer):
+    """A concrete implementation of the DataSerializer interface that
+    serializes and deserializes data to/from the FlatTensor format.
+    """
+
+    def __init__(self, config: Optional[FlatTensorConfig] = None) -> None:
+        """FlatTensorConfig holds information required for serialization,
+        eg. alignment.
+        """
+        if config is None:
+            self.config: FlatTensorConfig = FlatTensorConfig()
+        else:
+            self.config: FlatTensorConfig = config
+
+    def serialize(
+        self,
+        data: DataPayload,
+    ) -> Cord:
+        """Serializes a list of tensor metadata and tensors into a blob."""
+
+        flat_tensor_metadata: List[TensorMetadata] = []
+        flat_tensor_data: Cord = Cord()
+
+        # Maps buffer index -> offset of that buffer within flat_tensor_data.
+        saved_offsets: Dict[int, int] = {}
+
+        for fqn, tensor_entry in data.fqn_to_tensor.items():
+            assert tensor_entry.layout is not None
+            # Check index into the tensor buffers is valid.
+            assert tensor_entry.buffer_index < len(
+                data.buffers
+            ), f"Invalid index {tensor_entry.buffer_index} is greater than tensor buffer size {len(data.buffers)}."
+
+            # Check if the tensor has already been appended to the flat_tensor_data.
+            offset = saved_offsets.get(tensor_entry.buffer_index, -1)
+            if offset == -1:
+                if len(flat_tensor_data) > 0:
+                    # Add padding to round off the previous tensor offset.
+                    pad_length = padding_required(
+                        len(flat_tensor_data), self.config.tensor_alignment
+                    )
+                    flat_tensor_data.append(b"\x00" * pad_length)
+                # Add to saved offsets.
+                offset = len(flat_tensor_data)
+                saved_offsets[tensor_entry.buffer_index] = offset
+                # Append to flat_tensor_data at the offset.
+                flat_tensor_data.append(data.buffers[tensor_entry.buffer_index])
+
+            flat_tensor_metadata.append(
+                TensorMetadata(
+                    fully_qualified_name=fqn,
+                    scalar_type=tensor_entry.layout.scalar_type,
+                    sizes=tensor_entry.layout.sizes,
+                    dim_order=tensor_entry.layout.dim_order,
+                    segment_index=0,
+                    offset=offset,
+                )
+            )
+
+        # Pad flat_tensor_data to segment alignment.
+        segment_pad_length = padding_required(
+            len(flat_tensor_data), self.config.segment_alignment
+        )
+        if segment_pad_length > 0:
+            flat_tensor_data.append(b"\x00" * segment_pad_length)
+
+        # Create FlatTensor, which describes the contents of the file and
+        # points to all the data segments. It will be serialized to flatbuffer.
+        flat_tensor = FlatTensor(
+            version=0,
+            tensor_alignment=self.config.tensor_alignment,
+            tensors=flat_tensor_metadata,
+            segments=[DataSegment(offset=0, size=len(flat_tensor_data))],
+        )
+
+        flatbuffer_payload = _convert_to_flatbuffer(flat_tensor)
+        padded_flatbuffer_length: int = aligned_size(
+            input_size=len(flatbuffer_payload),
+            alignment=self.config.tensor_alignment,
+        )
+
+        padded_header_length: int = aligned_size(
+            input_size=FlatTensorHeader.EXPECTED_LENGTH,
+            alignment=self.config.tensor_alignment,
+        )
+
+        segment_base_offset = aligned_size(
+            padded_flatbuffer_length + padded_header_length,
+            self.config.segment_alignment,
+        )
+
+        # Create FlatTensorHeader, which stores the offsets and sizes of the
+        # FlatTensor flatbuffer and the segment data.
+        header_data: bytes = FlatTensorHeader(
+            flatbuffer_offset=padded_header_length,
+            flatbuffer_size=len(flatbuffer_payload),
+            segment_base_offset=segment_base_offset,
+            segment_data_size=len(flat_tensor_data),
+        ).to_bytes()
+
+        # Pad header and flatbuffer to their tensor_alignment-padded lengths.
+        header_data = pad_to(header_data, padded_header_length)
+        flatbuffer_payload.append(
+            b"\x00" * (padded_flatbuffer_length - len(flatbuffer_payload))
+        )
+
+        # Concatenate header, flatbuffer and segment data into a single blob.
+        payload = Cord()
+        payload.append(header_data)
+        payload.append(flatbuffer_payload)
+        payload.append(flat_tensor_data)
+
+        return payload
+
+    def deserialize(self, blob: Cord) -> DataPayload:
+        """
+        Deserializes a flat_tensor blob into a list of tensor metadata and tensors.
+        """
+        raise NotImplementedError("deserialize_data")
diff --git a/extension/flat_tensor/test/TARGETS b/extension/flat_tensor/test/TARGETS
new file mode 100644
index 0000000000..6f708ae848
--- /dev/null
+++ b/extension/flat_tensor/test/TARGETS
@@ -0,0 +1,14 @@
+load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+
+oncall("executorch")
+
+python_unittest(
+    name = "serialize",
+    srcs = [
+        "test_serialize.py",
+    ],
+    deps = [
+        "//executorch/extension/flat_tensor/serialize:serialize",
+        "//executorch/extension/flat_tensor/serialize:schema",
+    ],
+)
diff --git a/extension/flat_tensor/test/test_serialize.py b/extension/flat_tensor/test/test_serialize.py
new file mode 100644
index 0000000000..d023567274
--- /dev/null
+++ b/extension/flat_tensor/test/test_serialize.py
@@ -0,0 +1,85 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import unittest
+
+from executorch.exir._serialize.data_serializer import (
+    DataPayload,
+    DataSerializer,
+    TensorEntry,
+    TensorLayout,
+)
+
+from executorch.exir._serialize.padding import aligned_size
+
+from executorch.exir.schema import ScalarType
+
+from executorch.extension.flat_tensor.serialize.serialize import (
+    FlatTensorConfig,
+    FlatTensorHeader,
+    FlatTensorSerializer,
+)
+
+# Test artifacts.
+TEST_TENSOR_BUFFER = [b"tensor"]
+TEST_TENSOR_MAP = {
+    "fqn1": TensorEntry(
+        buffer_index=0,
+        layout=TensorLayout(
+            scalar_type=ScalarType.FLOAT,
+            sizes=[1, 1, 1],
+            dim_order=[0, 1, 2],
+        ),
+    ),
+    "fqn2": TensorEntry(
+        buffer_index=0,
+        layout=TensorLayout(
+            scalar_type=ScalarType.FLOAT,
+            sizes=[1, 1, 1],
+            dim_order=[0, 1, 2],
+        ),
+    ),
+}
+TEST_DATA_PAYLOAD = DataPayload(
+    buffers=TEST_TENSOR_BUFFER,
+    fqn_to_tensor=TEST_TENSOR_MAP,
+)
+
+
+class TestSerialize(unittest.TestCase):
+    def test_serialize(self) -> None:
+        config = FlatTensorConfig()
+        serializer: DataSerializer = FlatTensorSerializer(config)
+
+        data = bytes(serializer.serialize(TEST_DATA_PAYLOAD))
+
+        header = FlatTensorHeader.from_bytes(data[0 : FlatTensorHeader.EXPECTED_LENGTH])
+        self.assertTrue(header.is_valid())
+
+        # Header is aligned to config.segment_alignment, which is where the flatbuffer starts.
+        self.assertEqual(
+            header.flatbuffer_offset,
+            aligned_size(FlatTensorHeader.EXPECTED_LENGTH, config.segment_alignment),
+        )
+
+        # Flatbuffer is non-empty.
+        self.assertGreater(header.flatbuffer_size, 0)
+
+        # Segment base offset is aligned to config.segment_alignment.
+        expected_segment_base_offset = aligned_size(
+            header.flatbuffer_offset + header.flatbuffer_size, config.segment_alignment
+        )
+        self.assertEqual(header.segment_base_offset, expected_segment_base_offset)
+
+        # TEST_TENSOR_BUFFER is aligned to config.segment_alignment.
+        self.assertEqual(header.segment_data_size, config.segment_alignment)
+
+        # Confirm the flatbuffer magic is present.
+        self.assertEqual(
+            data[header.flatbuffer_offset + 4 : header.flatbuffer_offset + 8], b"FT01"
+        )

From 0b81bb693d9b43cd84a6728aa3b7ab376139ad03 Mon Sep 17 00:00:00 2001
From: Guang Yang <42389959+guangy10@users.noreply.github.com>
Date: Tue, 14 Jan 2025 16:55:50 -0800
Subject: [PATCH 03/40] Additional QNN version fix (#7664)

Co-authored-by: Guang Yang <guangyang@fb.com>
---
 .github/workflows/android-perf.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
index 1874a4fd6e..f2a289e230 100644
--- a/.github/workflows/android-perf.yml
+++ b/.github/workflows/android-perf.yml
@@ -260,7 +260,7 @@ jobs:
                       --output_name="${OUT_ET_MODEL_NAME}.pte"
                     ls -lh "${OUT_ET_MODEL_NAME}.pte"
                 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
-                    export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
+                    export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
                     export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
                     export PYTHONPATH=$(pwd)/..
 
@@ -347,7 +347,7 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
 
         export ANDROID_ABIS="arm64-v8a"
-        PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+        PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
 
   # Let's see how expensive this job is, we might want to tone it down by running it periodically
   benchmark-on-device:

From d9992040744bb1a5da4cffb1727086b027766e5e Mon Sep 17 00:00:00 2001
From: lucylq <lfq@meta.com>
Date: Tue, 14 Jan 2025 19:05:33 -0800
Subject: [PATCH 04/40] fix-up (#7665)

---
 .../runtime/graph/ops/glsl/conv2d_dw.glsl     |  2 +-
 .../graph/ops/glsl/conv2d_dw_output_tile.glsl |  2 +-
 .../runtime/graph/ops/glsl/conv2d_pw.glsl     |  2 +-
 .../graph/ops/glsl/indexing_utils_u16.h       | 19 -------------------
 4 files changed, 3 insertions(+), 22 deletions(-)
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
index 23cbb1b652..103f3cfdd7 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl
@@ -14,7 +14,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils_u16.h"
+#include "indexing_utils.h"
 
 layout(std430) buffer;
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index 48afd3a9a7..9e69fdd1fe 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -22,7 +22,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils_u16.h"
+#include "indexing_utils.h"
 
 layout(std430) buffer;
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
index b50a892cad..a5a2097cd5 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -16,7 +16,7 @@
 
 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils_u16.h"
+#include "indexing_utils.h"
 
 layout(std430) buffer;
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h
deleted file mode 100644
index 6dc59b6303..0000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#ifndef INDEXING_UTILS_U16_H
-#define INDEXING_UTILS_U16_H
-
-#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
-
-u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) {
-  const uint div_by_x = idx / size_x;
-  return u16vec3(idx % size_x, div_by_x % size_y, div_by_x / size_y);
-}
-
-#endif // INDEXING_UTILS_U16_H

From 24f0d34dcddf7e050b601b55e3e310f72568399a Mon Sep 17 00:00:00 2001
From: Gregory Comer <gjcomer@meta.com>
Date: Tue, 14 Jan 2025 21:11:53 -0800
Subject: [PATCH 05/40] Log dtype names on input dtype mismatch (#7537)

Log dtype names on input dtype mismatch (#7537)

Summary:
Update the error message when input tensor scalar type is incorrect. We've seen this get hit a few times and it should be easier to debug than it is.

New Message:
```
[method.cpp:834] Input 0 has unexpected scalar type: expected Float but was Byte.
```
Old Message:
```
[method.cpp:826] The 0-th input tensor's scalartype does not meet requirement: found 0 but expected 6
```


Test Plan:
Built executorch bento kernel locally and tested with an incorrect scalar type to view the new error message.
```
[method.cpp:834] Input 0 has unexpected scalar type: expected Float but was Byte.
```
I also locally patched and built the bento kernel with ET_ENABLE_ENUM_STRINGS=0.
```
[method.cpp:834] Input 0 has unexpected scalar type: expected 6 but was 0.
```

Differential Revision: D67887770

Pulled By: GregoryComer
---
 runtime/executor/method.cpp  | 8 ++++----
 runtime/executor/targets.bzl | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
index c9563d3ae5..c539613e6b 100644
--- a/runtime/executor/method.cpp
+++ b/runtime/executor/method.cpp
@@ -840,14 +840,14 @@ Method::set_input(const EValue& input_evalue, size_t input_idx) {
   if (e.isTensor()) {
     const auto& t_dst = e.toTensor();
     const auto& t_src = input_evalue.toTensor();
+
     ET_CHECK_OR_RETURN_ERROR(
         t_dst.scalar_type() == t_src.scalar_type(),
         InvalidArgument,
-        "The %zu-th input tensor's scalartype does not meet requirement: found %" PRId8
-        " but expected %" PRId8,
+        "Input %zu has unexpected scalar type: expected %s but was %s.",
         input_idx,
-        static_cast<int8_t>(t_src.scalar_type()),
-        static_cast<int8_t>(t_dst.scalar_type()));
+        executorch::runtime::toString(t_dst.scalar_type()),
+        executorch::runtime::toString(t_src.scalar_type()));
     // Reset the shape for the Method's input as the size of forwarded input
     // tensor for shape dynamism. Also is a safety check if need memcpy.
     Error err = resize_tensor(t_dst, t_src.sizes());
diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl
index cc91255d7b..158da5d108 100644
--- a/runtime/executor/targets.bzl
+++ b/runtime/executor/targets.bzl
@@ -82,6 +82,7 @@ def define_common_targets():
                 "//executorch/runtime/core:evalue" + aten_suffix,
                 "//executorch/runtime/core:event_tracer" + aten_suffix,
                 "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
+                "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix,
                 "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
                 "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix,
                 "//executorch/runtime/kernel:operator_registry",

From 3ef100dd7c0099ad9c81e0d75571f065d7d46adf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Per=20=C3=85strand?= <per@users.noreply.github.com>
Date: Wed, 15 Jan 2025 09:34:06 +0100
Subject: [PATCH 06/40] Use ArmQuantizer to quantize bias (#7649)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the 'manual' quantization of bias parameter and let the
quantizer handle the quantization instead.

Signed-off-by: Per Åstrand <per.astrand@arm.com>
---
 backends/arm/process_node.py                  | 66 +++----------------
 .../arm/quantizer/quantization_annotator.py   |  4 +-
 backends/arm/quantizer/quantization_config.py | 40 ++++++++++-
 backends/arm/test/misc/test_debug_feats.py    |  4 +-
 backends/arm/tosa_utils.py                    | 15 +----
 5 files changed, 51 insertions(+), 78 deletions(-)

diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py
index 6aa663b81e..9ab9c49044 100644
--- a/backends/arm/process_node.py
+++ b/backends/arm/process_node.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -11,11 +11,6 @@
 import serializer.tosa_serializer as ts
 import torch
 import torch.fx
-
-# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
-from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
-    get_input_qparams,
-)
 from executorch.backends.arm.operators.node_visitor import NodeVisitor
 from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg
 from executorch.backends.arm.tosa_quant_utils import (
@@ -24,11 +19,7 @@
     is_node_quantized,
 )
 from executorch.backends.arm.tosa_specification import TosaSpecification
-from executorch.backends.arm.tosa_utils import (
-    getNodeArgs,
-    is_bias_node_for_quantized_conv,
-    tosa_shape,
-)
+from executorch.backends.arm.tosa_utils import getNodeArgs, tosa_shape
 from torch.export.exported_program import ExportedProgram
 
 
@@ -99,41 +90,6 @@ def process_inputs(
     tosa_graph.addInputTensor(tensor)
 
 
-def process_quantized_bias(
-    node: torch.fx.Node,
-    tosa_graph: ts.TosaSerializer,
-    parameter_values,
-):
-    """
-    Serialize bias node that needs to be quantized.
-    """
-    consumer_node = list(node.users)[0]
-    (
-        input_node,
-        weight_node,
-        _,
-    ) = consumer_node.all_input_nodes
-
-    input_qargs = get_input_qparams(  # pyre-ignore[16]: Module `executorch.backends.arm` has no attribute `_passes`.
-        consumer_node
-    )
-
-    input_node_scale = input_qargs[0].scale
-    weight_node_scale = input_qargs[1].scale
-    bias_values_quantized = (
-        (parameter_values / (input_node_scale * weight_node_scale))
-        .round()
-        .astype(np.int32)
-    )
-
-    tosa_graph.addConst(
-        bias_values_quantized.shape,
-        ts.DType.INT32,
-        bias_values_quantized,
-        name=node.name,
-    )
-
-
 def process_inputs_to_parameters(
     node: torch.fx.Node,
     tosa_graph: ts.TosaSerializer,
@@ -148,20 +104,14 @@ def process_inputs_to_parameters(
     assert isinstance(parameter_data, torch.Tensor), "Expect Attr to be tensor"
     parameter_values = parameter_data.detach().numpy()
 
-    if is_bias_node_for_quantized_conv(node):
-        # BI bias
-        assert tosa_spec.support_integer(), f"{tosa_spec} doesnt't support integer"
-        process_quantized_bias(node, tosa_graph, parameter_values)
-    else:
-        # MI weights or bias
-        if inputs[0].dtype == torch.float32:
-            assert tosa_spec.support_float(), f"{tosa_spec} doesn't support float"
+    if inputs[0].dtype == torch.float32:
+        assert tosa_spec.support_float(), f"{tosa_spec} doesn't support float"
 
-        parameter_values = np.transpose(parameter_values, inputs[0].dim_order)
+    parameter_values = np.transpose(parameter_values, inputs[0].dim_order)
 
-        tosa_graph.addConst(
-            parameter_values.shape, inputs[0].dtype, parameter_values, name=node.name
-        )
+    tosa_graph.addConst(
+        parameter_values.shape, inputs[0].dtype, parameter_values, name=node.name
+    )
 
 
 def process_inputs_to_buffers(
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
index 9ddeb61c30..9c4187d32a 100644
--- a/backends/arm/quantizer/quantization_annotator.py
+++ b/backends/arm/quantizer/quantization_annotator.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -196,7 +196,7 @@ def get_quant_properties(  # noqa: C901
     input_act_qspec = quantization_config.get_input_act_qspec()
     weight_qspec = quantization_config.get_weight_qspec()
     output_act_qspec = quantization_config.get_output_act_qspec()
-    bias_qspec = quantization_config.get_bias_qspec()
+    bias_qspec = quantization_config.get_bias_qspec(node)
 
     quant_properties = _OpQuantProperties()
 
diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py
index 1e776d37a6..b94d9bda64 100644
--- a/backends/arm/quantizer/quantization_config.py
+++ b/backends/arm/quantizer/quantization_config.py
@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -9,8 +9,10 @@
 from dataclasses import dataclass
 
 import torch
+from torch.ao.quantization import ObserverOrFakeQuantize
 
 from torch.ao.quantization.quantizer import (
+    DerivedQuantizationSpec,
     FixedQParamsQuantizationSpec,
     QuantizationSpec,
 )
@@ -53,8 +55,42 @@ def get_weight_qspec(self) -> QuantizationSpec | None:
         ], f"Unsupported quantization_spec {self.weight} for weight"
         return self.weight
 
-    def get_bias_qspec(self) -> QuantizationSpec | None:
+    def get_bias_qspec(self, node: torch.fx.Node) -> QuantizationSpec | None:
         """Returns QuantizationSpec 'bias' after asserting that bias.dtype is torch.float."""
+
+        def _derive_qparams_fn(
+            obs_or_fqs: list[ObserverOrFakeQuantize],
+        ) -> tuple[torch.Tensor, torch.Tensor]:
+            assert (
+                len(obs_or_fqs) == 2
+            ), "Expecting two obs/fqs, one for activation and one for weight, got: {}".format(
+                len(obs_or_fqs)
+            )
+            act_obs_or_fq = obs_or_fqs[0]
+            weight_obs_or_fq = obs_or_fqs[1]
+            act_scale, act_zp = act_obs_or_fq.calculate_qparams()
+            weight_scale, weight_zp = weight_obs_or_fq.calculate_qparams()
+            return torch.tensor([act_scale * weight_scale]).to(
+                torch.float32
+            ), torch.tensor([0]).to(torch.int32)
+
+        if node.target in [
+            torch.ops.aten.conv1d.default,
+            torch.ops.aten.conv2d.default,
+            torch.ops.aten.linear.default,
+        ]:
+            input_act = node.args[0]
+            weight = node.args[1]
+            quantization_spec = DerivedQuantizationSpec(
+                derived_from=[(input_act, node), (weight, node)],
+                derive_qparams_fn=_derive_qparams_fn,
+                dtype=torch.int32,
+                quant_min=torch.iinfo(torch.int32).min,
+                quant_max=torch.iinfo(torch.int32).max - 1,
+                qscheme=torch.per_tensor_symmetric,
+            )
+            return quantization_spec
+
         if self.bias is None:
             return None
         assert (
diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py
index b2fc271ade..a9491418a4 100644
--- a/backends/arm/test/misc/test_debug_feats.py
+++ b/backends/arm/test/misc/test_debug_feats.py
@@ -197,10 +197,10 @@ def test_collate_tosa_BI_tests(self):
             "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests"
         )
         assert os.path.exists(
-            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/output_tag5.tosa"
+            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/output_tag6.tosa"
         )
         assert os.path.exists(
-            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/desc_tag5.json"
+            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/desc_tag6.json"
         )
 
         os.environ.pop("TOSA_TESTCASES_BASE_PATH")
diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py
index 5bda9bbf18..c03e0ef0bb 100644
--- a/backends/arm/tosa_utils.py
+++ b/backends/arm/tosa_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -133,19 +133,6 @@ def build_reshape(tosa_fb, input_name, new_shape, output_name):
     tosa_fb.addOperator(TosaOp.Op().RESHAPE, [input_name], [output_name], attr)
 
 
-def is_bias_node_for_quantized_conv(node):
-    consumer_node = list(node.users)[0]
-
-    if (
-        consumer_node.target == exir_ops.edge.aten.convolution.default
-        and consumer_node.args[2] == node
-        and consumer_node.meta["val"].dtype == torch.int8
-    ):
-        return True
-
-    return False
-
-
 def is_consumer_node_depthwise_conv2d(node):
     consumer_node = list(node.users)[0]
     if consumer_node.target == exir_ops.edge.aten.convolution.default:

From 85d274a3d20b6aae43372f2bec40c5f43535775c Mon Sep 17 00:00:00 2001
From: Sebastian Larsson <38941629+Sebastian-Larsson@users.noreply.github.com>
Date: Wed, 15 Jan 2025 10:00:44 +0100
Subject: [PATCH 07/40] Arm backend: Size adjust conv2d pass improvements
 (#7646)

[Arm backend] Improve the documentation of the size adjust conv2d pass and remove
duplicated code.

Also add more tests to conv1d and conv2d that need to go through the pass.
---
 .../arm/_passes/size_adjust_conv2d_pass.py    | 103 +++++++++--------
 backends/arm/test/ops/test_conv1d.py          |  45 ++++++++
 backends/arm/test/ops/test_conv2d.py          | 105 +++++++++++++++++-
 3 files changed, 200 insertions(+), 53 deletions(-)

diff --git a/backends/arm/_passes/size_adjust_conv2d_pass.py b/backends/arm/_passes/size_adjust_conv2d_pass.py
index 08da9a74c9..ee81127343 100644
--- a/backends/arm/_passes/size_adjust_conv2d_pass.py
+++ b/backends/arm/_passes/size_adjust_conv2d_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -6,68 +6,69 @@
 
 # pyre-unsafe
 
-from typing import cast, Optional
+from typing import cast
 
 import torch.fx
+from executorch.backends.arm._passes.arm_pass_utils import create_node
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
-from torch._ops import OpOverload
 
 
 def conv_remainder(input_length, pad, dilation, weight, stride):
     """
-    Returns the size
+    Returns the remainder of input_length, given the padding, dilation, stride,
+    and kernel size.
     """
     return (input_length + 2 * pad - dilation * (weight - 1) - 1) % stride
 
 
-def insert_q_dq_pair(
-    graph: torch.fx.Graph,
-    anchor: torch.fx.Node,
-    q_params: tuple,
-):
-    with graph.inserting_after(anchor):
-        q = create_node(
-            graph=graph,
-            op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-            args=(),  # We add the argument last
-        )
-        q.meta = anchor.meta
-
-    with graph.inserting_after(q):
-        dq = create_node(
-            graph=graph,
-            op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
-            args=(q,) + q_params,
-        )
-        dq.meta = q.meta
-
-    anchor.replace_all_uses_with(dq)
-    # We add this last so the replace all uses above does not replace the quantized
-    # node's first use
-    q.args = (anchor,) + q_params
-    return dq
-
-
-def create_node(
-    graph: torch.fx.Graph,
-    op_target: OpOverload,
-    args: tuple = (),
-    kwargs: Optional[dict] = None,
-):
-    return graph.create_node(
-        "call_function",
-        op_target,
-        args=args,
-        kwargs=kwargs or {},
-    )
-
-
 class SizeAdjustConv2DPass(ExportPass):
     """
-    Adjust the convolution input size to match perfectly with the
-    weight size, padding, stride and dilation parameters.
-    This is done by inserting a slice op to remove the uneven end of the input.
+    Adjust the convolution input size to match the kernel size, padding, stride,
+    and dilation parameters. PyTorch allows the input and kernel shape to not
+    "match", in which case the remaining rows/columns are truncated. However,
+    matching the size is a requirement in the TOSA specification. In case the
+    input and kernel shape do not match, the following is done to meet the
+    specification:
+
+      1) The padding is truncated (done in the node visitor)
+      2) (if necessary) The input is truncated (done in this pass).
+
+    A simple example would be a 2x2 kernel (no padding, stride=2) and a 5x5
+    input:
+
+    ┌───┬───┬───┬───┬───┐    ┌───┬───┬───┬───┬───┐    ┌───┬───┬───┬───┬───┐
+    │ X │ X │   │   │   │    │   │   │ X │ X │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │ X │ X │   │   │   │    │   │   │ X │ X │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │ -> │   │   │   │   │   │ -> │ X │ X │   │   │   │ ->
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │    │   │   │   │   │   │    │ X │ X │   │   │   │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │    │   │   │   │   │   │    │   │   │   │   │   │
+    └───┴───┴───┴───┴───┘    └───┴───┴───┴───┴───┘    └───┴───┴───┴───┴───┘
+         First pass               second pass              third pass
+
+    ┌───┬───┬───┬───┬───┐    ┌───┬───┬───┬───┬───┐
+    │   │   │   │   │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │ X │ X │   │ -> │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │ X │ X │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │    │ - │ - │ - │ - │ - │
+    └───┴───┴───┴───┴───┘    └───┴───┴───┴───┴───┘
+         Fourth pass            Unvisited cells
+
+    Cells that are never visited are marked with `-` and are never considered
+    when the kernel traverses over the input, hence they can be removed.
+
+    To match the shape of the kernel (and all parameters) with the input, a
+    slice op is inserted to remove the remaining edges (rows and columns) of the
+    input.
     """
 
     conv2d_op = exir_ops.edge.aten.convolution.default
@@ -109,9 +110,7 @@ def call(self, graph_module: torch.fx.GraphModule):
             with graph_module.graph.inserting_before(node):
                 last_node = cast(torch.fx.Node, input_node)
                 for args in slice_args:
-                    slice_node = graph.create_node(
-                        "call_function", self.slice_op, (last_node,) + args
-                    )
+                    slice_node = create_node(graph, self.slice_op, (last_node,) + args)
                     last_node = slice_node
                 conv_node.replace_input_with(cast(torch.fx.Node, input_node), last_node)
                 modified_graph = True
diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py
index b754a91f36..3e0dfa6c5c 100644
--- a/backends/arm/test/ops/test_conv1d.py
+++ b/backends/arm/test/ops/test_conv1d.py
@@ -180,6 +180,47 @@ def forward(self, x):
     batches=1,
 )
 
+conv1d_7_1x3x16_st2_pd1_dl2 = Conv1d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=7,
+    stride=2,
+    padding=1,
+    dilation=2,
+    length=16,
+    batches=1,
+)
+conv1d_7_1x3x15_st1_pd0_dl1 = Conv1d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=7,
+    stride=1,
+    padding=0,
+    dilation=1,
+    length=15,
+    batches=1,
+)
+conv1d_5_1x3x14_st5_pd0_dl1 = Conv1d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=5,
+    stride=5,
+    padding=0,
+    dilation=1,
+    length=14,
+    batches=1,
+)
+conv1d_5_1x3x9_st5_pd0_dl1 = Conv1d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=5,
+    stride=5,
+    padding=0,
+    dilation=1,
+    length=9,
+    batches=1,
+)
+
 two_conv1d_nobias = Conv1d(
     nbr_conv=2,
     length=256,
@@ -214,6 +255,10 @@ def forward(self, x):
     ("2_1x2x14_st2", conv1d_2_1x2x14_st2),
     ("5_3x2x128_st1", conv1d_5_3x2x128_st1),
     ("3_1x3x224_st2_pd1", conv1d_3_1x3x224_st2_pd1),
+    ("7_1x3x16_st2_pd1_dl2_needs_adjust_pass", conv1d_7_1x3x16_st2_pd1_dl2),
+    ("7_1x3x15_st1_pd0_dl1_needs_adjust_pass", conv1d_7_1x3x15_st1_pd0_dl1),
+    ("5_1x3x14_st5_pd0_dl1_needs_adjust_pass", conv1d_5_1x3x14_st5_pd0_dl1),
+    ("5_1x3x9_st5_pd0_dl1_needs_adjust_pass", conv1d_5_1x3x9_st5_pd0_dl1),
     ("two_conv1d_nobias", two_conv1d_nobias),
     ("two_conv1d", two_conv1d),
 ]
diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py
index bbcb421ce7..b80228c6f2 100644
--- a/backends/arm/test/ops/test_conv2d.py
+++ b/backends/arm/test/ops/test_conv2d.py
@@ -201,6 +201,101 @@ def forward(self, x):
     batches=1,
 )
 
+conv2d_7x7_1x3x16x16_st2_pd1_dl2 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(7, 7),
+    stride=2,
+    padding=1,
+    dilation=2,
+    width=16,
+    height=16,
+    batches=1,
+)
+
+conv2d_7x7_1x3x15x15_st1_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(7, 7),
+    stride=1,
+    padding=0,
+    dilation=1,
+    width=15,
+    height=15,
+    batches=1,
+)
+
+conv2d_5x5_1x3x14x14_st5_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(5, 5),
+    stride=5,
+    padding=0,
+    dilation=1,
+    width=14,
+    height=14,
+    batches=1,
+)
+
+conv2d_5x5_1x3x9x9_st5_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(5, 5),
+    stride=5,
+    padding=0,
+    dilation=1,
+    width=9,
+    height=9,
+    batches=1,
+)
+
+conv2d_3x3_1x3x8x9_st3_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(3, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=8,
+    height=9,
+    batches=1,
+)
+
+conv2d_3x3_1x3x9x8_st3_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(3, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=9,
+    height=8,
+    batches=1,
+)
+
+conv2d_3x4_1x3x7x7_st3_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(3, 4),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=7,
+    height=7,
+    batches=1,
+)
+
+conv2d_4x3_1x3x7x7_st3_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(4, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=7,
+    height=7,
+    batches=1,
+)
 
 two_conv2d_nobias = Conv2d(
     nbr_conv=2,
@@ -236,7 +331,15 @@ def forward(self, x):
     ("3x3_1x3x12x12_st2_pd1", conv2d_3x3_1x3x12x12_st2_pd1),
     ("1x1_1x2x128x128_st1", conv2d_1x1_1x2x128x128_st1),
     ("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2),
-    ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1),
+    ("5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1),
+    ("7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass", conv2d_7x7_1x3x16x16_st2_pd1_dl2),
+    ("7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass", conv2d_7x7_1x3x15x15_st1_pd0_dl1),
+    ("5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass", conv2d_5x5_1x3x14x14_st5_pd0_dl1),
+    ("5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass", conv2d_5x5_1x3x9x9_st5_pd0_dl1),
+    ("3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass", conv2d_3x3_1x3x9x8_st3_pd0_dl1),
+    ("3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass", conv2d_3x3_1x3x8x9_st3_pd0_dl1),
+    ("3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass", conv2d_3x4_1x3x7x7_st3_pd0_dl1),
+    ("4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass", conv2d_4x3_1x3x7x7_st3_pd0_dl1),
     ("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1),
     ("3x3_1x3x224x224_st2_pd1", conv2d_3x3_1x3x224x224_st2_pd1),
     ("two_conv2d_nobias", two_conv2d_nobias),

From 01d526f8b3897f1ea1ebe6fe6517b59222154f7a Mon Sep 17 00:00:00 2001
From: Gasoonjia <gasoonjia@meta.com>
Date: Wed, 15 Jan 2025 01:25:19 -0800
Subject: [PATCH 08/40] make dim order as default for everywhere in ET

Differential Revision: D68167741

Pull Request resolved: https://github.com/pytorch/executorch/pull/7658
---
 extension/export_util/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/extension/export_util/utils.py b/extension/export_util/utils.py
index 66154b95fa..a289355919 100644
--- a/extension/export_util/utils.py
+++ b/extension/export_util/utils.py
@@ -60,7 +60,6 @@ def _core_aten_to_edge(
     if not edge_compile_config:
         edge_compile_config = exir.EdgeCompileConfig(
             _check_ir_validity=False,  # quant ops currently break ir verification
-            _skip_dim_order=True,  # TODO(T182928844): dim order ops can not delegate to backend
         )
     edge_manager: EdgeProgramManager = to_edge(
         core_aten_exir_ep,

From a18f6e89631ab97f6211aaae537bfe2cdd90729e Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Wed, 15 Jan 2025 10:58:51 -0500
Subject: [PATCH 09/40] Update torchtune pin (#7670)

---
 examples/models/llama3_2_vision/install_requirements.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/models/llama3_2_vision/install_requirements.sh b/examples/models/llama3_2_vision/install_requirements.sh
index a4d789d56e..4d4a6f2862 100755
--- a/examples/models/llama3_2_vision/install_requirements.sh
+++ b/examples/models/llama3_2_vision/install_requirements.sh
@@ -5,7 +5,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-NIGHTLY_VERSION="dev20241112"
+NIGHTLY_VERSION="dev20250115"
 
 # Install torchtune nightly for model definitions.
-pip install --pre torchtune==0.4.0.${NIGHTLY_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu --no-cache-dir
+pip install --pre torchtune==0.6.0.${NIGHTLY_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu --no-cache-dir

From d596cd78cf2280c9c01adbfc95b54a29865f3fe5 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Wed, 15 Jan 2025 13:34:43 -0600
Subject: [PATCH 10/40] [ET-VK][ez] Test specific sizes of linear sizes in
 generated operator tests (#7672)

Pull Request resolved: https://github.com/pytorch/executorch/pull/7667

## Context

Recent changes related to checking SPIR-V capability support at runtime have made it possible to test the 8-bit quantized linear compute shader on Android devices. Previously the test would be automatically skipped since the operator potentially uses 8-bit data types.

To make the generated tests more useful, instead test real sizes of linear layer settings found in a sample model in the 8-bit linear test case.
ghstack-source-id: 261524380
@exported-using-ghexport

Differential Revision: [D68192068](https://our.internmc.facebook.com/intern/diff/D68192068/)

Co-authored-by: Stephen Jia <ssjia@meta.com>
---
 backends/vulkan/test/op_tests/cases.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index d32fa71573..9cec4891c1 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -169,7 +169,13 @@ def get_linear_test_suites():
 
 @register_test_suite("aten._weight_int8pack_mm.default")
 def get_weight_int8pack_mm_inputs():
-    MKN_list = common_MKN_list
+    MKN_list = [
+        [6, 480, 256],
+        [6, 256, 1024],
+        [6, 1024, 256],
+        [6, 256, 256],
+        [6, 256, 512],
+    ]
 
     inputs_list = [((M, K), (N, K), (N)) for M, K, N in MKN_list]
 

From 0dbe214712cff2d0eac1d33743ae26aa67d19d04 Mon Sep 17 00:00:00 2001
From: wesleyer <wesleyer@gmail.com>
Date: Wed, 15 Jan 2025 12:41:57 -0800
Subject: [PATCH 11/40] Fix duplication of classes in modules

Differential Revision: D68027619

Pull Request resolved: https://github.com/pytorch/executorch/pull/7606
---
 extension/android/BUCK | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/extension/android/BUCK b/extension/android/BUCK
index 5d021250e6..040c9258d4 100644
--- a/extension/android/BUCK
+++ b/extension/android/BUCK
@@ -23,18 +23,13 @@ fb_android_library(
 fb_android_library(
     name = "executorch_llama",
     srcs = [
-        "src/main/java/org/pytorch/executorch/DType.java",
-        "src/main/java/org/pytorch/executorch/EValue.java",
         "src/main/java/org/pytorch/executorch/LlamaCallback.java",
         "src/main/java/org/pytorch/executorch/LlamaModule.java",
-        "src/main/java/org/pytorch/executorch/Module.java",
-        "src/main/java/org/pytorch/executorch/NativePeer.java",
-        "src/main/java/org/pytorch/executorch/Tensor.java",
-        "src/main/java/org/pytorch/executorch/annotations/Experimental.java",
     ],
     autoglob = False,
     language = "JAVA",
     deps = [
+        ":executorch",
         "//fbandroid/java/com/facebook/jni:jni",
         "//fbandroid/libraries/soloader/java/com/facebook/soloader/nativeloader:nativeloader",
     ],

From ba6c55211a03be8cca93750f510ff06c22a11a1b Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Wed, 15 Jan 2025 16:16:11 -0500
Subject: [PATCH 12/40] Enable bot to cc people (#7680)

Summary:

The bot will be configured based on this tracking issue: https://github.com/pytorch/executorch/issues/7679
---
 .github/pytorch-probot.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
index 2b66829ed0..a83087c8d5 100644
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@@ -1,4 +1,5 @@
 # The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml
+tracking_issue: 7679
 ciflow_push_tags:
 - ciflow/android
 - ciflow/apple

From a727b55b3584c9f6330eae3c4762a2ca500247dc Mon Sep 17 00:00:00 2001
From: cccclai <chenlai@meta.com>
Date: Wed, 15 Jan 2025 13:54:50 -0800
Subject: [PATCH 13/40] fix delegate cache duplicate bug

Differential Revision: D67067997

Pull Request resolved: https://github.com/pytorch/executorch/pull/7281
---
 exir/_serialize/_program.py         |  8 +++--
 exir/backend/test/demos/rpc/TARGETS |  1 +
 exir/emit/_emitter.py               | 23 ++++++++----
 exir/emit/test/TARGETS              |  1 +
 exir/emit/test/test_emit.py         | 56 ++++++++++++++++++++++++++++-
 5 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py
index 80b740674a..7656ea3f36 100644
--- a/exir/_serialize/_program.py
+++ b/exir/_serialize/_program.py
@@ -224,6 +224,7 @@ def _extract_delegate_segments(
     """
     remaining_inline: List[BackendDelegateInlineData] = []
     inline_indices_seen: set[int] = set()
+    segment_index_map: dict[bytes, int] = {}
     for plan in program.execution_plan:
         for delegate in plan.delegates:
             if delegate.processed.location != DataLocation.INLINE:
@@ -249,8 +250,11 @@ def _extract_delegate_segments(
             inline_indices_seen.add(delegate.processed.index)
             if inline.data:
                 # Move the delegate data out of the program.
-                segment_index = len(segments)
-                segments.append(Cord(inline.data))
+                segment_index = segment_index_map.get(inline.data)
+                if segment_index is None:
+                    segment_index = len(segments)
+                    segments.append(Cord(inline.data))
+                    segment_index_map[inline.data] = segment_index
                 delegate.processed = BackendDelegateDataReference(
                     location=DataLocation.SEGMENT,
                     index=segment_index,
diff --git a/exir/backend/test/demos/rpc/TARGETS b/exir/backend/test/demos/rpc/TARGETS
index a2aadb05ef..63d24ccbda 100644
--- a/exir/backend/test/demos/rpc/TARGETS
+++ b/exir/backend/test/demos/rpc/TARGETS
@@ -28,6 +28,7 @@ runtime.python_library(
     ],
     visibility = [
         "//executorch/exir/backend/test/...",
+        "//executorch/exir/emit/test/...",
     ],
     deps = [
         ":executor_backend_preprocess",
diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py
index d08e68fa73..c40a00b240 100644
--- a/exir/emit/_emitter.py
+++ b/exir/emit/_emitter.py
@@ -122,6 +122,8 @@ class _ProgramState:
     # Delegate data stored directly in the flatbuffer. Pointed to by BackendDelegateDataReference,
     # and should be copied to Program.backend_delegate_data.
     backend_delegate_data: List[BackendDelegateInlineData] = field(default_factory=list)
+    # Delegate cache that is used across all entry points. Key is the hash of the delegated payload.
+    backend_delegate_data_cache: Dict[str, int] = field(default_factory=dict)
 
     # Constants are optionally stored in external files.
     # Aggregate unique external constants into one buffer.
@@ -144,7 +146,8 @@ class _EmitterState:
     operators: List[Operator]
     delegates: List[BackendDelegate]
     operator_cache: Dict[Tuple[str, str], int]
-    delegate_cache: Dict[bytes, int]
+    # delegate_cache: the key is hash(delegated_payload) and the value is the index in delegates
+    delegate_cache: Dict[str, int]
     emit_stacktrace: bool
 
     spec2id_dict: Dict[TensorSpec, int] = field(default_factory=dict)
@@ -1073,8 +1076,8 @@ def _emit_delegate(
         """Emit the delegates inputs and outputs as specified by the schema, then emit the
         delegate's blob."""
         processed_bytes = lowered_module.processed_bytes
-
-        delegate_index = self.emitter_state.delegate_cache.get(processed_bytes)
+        hashed = hashlib.sha256(processed_bytes).hexdigest()
+        delegate_index = self.emitter_state.delegate_cache.get(hashed)
         delegate_ret = None
 
         if isinstance(self.node.meta["spec"], list):
@@ -1112,10 +1115,16 @@ def _emit_delegate(
         if delegate_index is None:
             # Allocate an entry for the data. TODO(T150113674): Reuse any duplicate entries if
             # present.
-            data_index: int = len(self.program_state.backend_delegate_data)
-            self.program_state.backend_delegate_data.append(
-                BackendDelegateInlineData(data=processed_bytes)
+            hashed = hashlib.sha256(processed_bytes).hexdigest()
+            data_index: Optional[int] = (
+                self.program_state.backend_delegate_data_cache.get(hashed)
             )
+            if data_index is None:
+                data_index = len(self.program_state.backend_delegate_data)
+                self.program_state.backend_delegate_data_cache[hashed] = data_index
+                self.program_state.backend_delegate_data.append(
+                    BackendDelegateInlineData(data=processed_bytes)
+                )
 
             backend_delegate = BackendDelegate(
                 id=lowered_module.backend_id,
@@ -1126,7 +1135,7 @@ def _emit_delegate(
             )
             delegate_index = len(self.emitter_state.delegate_cache)
             self.emitter_state.delegates.append(backend_delegate)
-            self.emitter_state.delegate_cache[processed_bytes] = delegate_index
+            self.emitter_state.delegate_cache[hashed] = delegate_index
 
         # TODO(angelayi) Will need to emit the kwargs too, in the correct order according to the
         # function's spec and with default arguments. This requires us to store the function's spec
diff --git a/exir/emit/test/TARGETS b/exir/emit/test/TARGETS
index 9f416e78ea..153843d45e 100644
--- a/exir/emit/test/TARGETS
+++ b/exir/emit/test/TARGETS
@@ -16,6 +16,7 @@ python_unittest(
         "//executorch/exir:lib",
         "//executorch/exir:print_program",
         "//executorch/exir:schema",
+        "//executorch/exir/backend/test/demos/rpc:executor_backend_partitioner",
         "//executorch/exir/backend:backend_api",
         "//executorch/exir/emit:lib",
         "//executorch/exir/passes:const_prop_pass",
diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py
index 0da4085914..3fca3958fe 100644
--- a/exir/emit/test/test_emit.py
+++ b/exir/emit/test/test_emit.py
@@ -27,6 +27,9 @@
 from executorch.exir._serialize._program import deserialize_pte_binary
 from executorch.exir.backend.backend_api import to_backend
 from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult
+from executorch.exir.backend.test.demos.rpc.executor_backend_partitioner import (
+    ExecutorBackendPartitioner,
+)
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.emit import emit_program  # noqa
 from executorch.exir.error import InternalError
@@ -63,7 +66,7 @@
 from functorch.experimental import control_flow
 from torch import nn
 
-from torch.export import Dim, export
+from torch.export import Dim, export, export_for_training
 
 
 class WrapperModule(torch.nn.Module):
@@ -1679,3 +1682,54 @@ def forward(self, x):
         ]
         self.assertEqual(external_map["linear.weight"], 0)
         self.assertEqual(external_map["linear.bias"], 1)
+
+    def test_delegate_deduplicate(self) -> None:
+        class SharedModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(2, 2)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        class Module1(torch.nn.Module):
+            def __init__(self, shared_module):
+                super().__init__()
+                self.shared_module = shared_module
+
+            def forward(self, x):
+                return self.shared_module(x)
+
+        class Module2(torch.nn.Module):
+            def __init__(self, shared_module):
+                super().__init__()
+                self.shared_module = shared_module
+
+            def forward(self, x):
+                return self.shared_module(x)
+
+        shared_module = SharedModule()
+        module_1 = Module1(shared_module)
+        module_2 = Module2(shared_module)
+        example_inputs = (torch.randn(2, 2),)
+        module_1(*example_inputs)
+        module_2(*example_inputs)
+
+        ep1 = export_for_training(module_1, example_inputs)
+        ep2 = export_for_training(module_2, example_inputs)
+
+        edge_program_manager = exir.to_edge(
+            {"forward1": ep1, "forward2": ep2},
+            compile_config=exir.EdgeCompileConfig(
+                _check_ir_validity=False, _use_edge_ops=True
+            ),
+        )
+
+        edge_program_manager = edge_program_manager.to_backend(
+            ExecutorBackendPartitioner()
+        ).to_executorch()
+
+        # Check that there is only one delegate because two methods are exactly the same
+        self.assertEqual(
+            len(edge_program_manager.executorch_program.backend_delegate_data), 1
+        )

From ee00caa7ed26917a05706e62a5850e6a2454d42c Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Wed, 15 Jan 2025 18:55:24 -0600
Subject: [PATCH 14/40] [xnnpack] Add debug XNNGraph printing (#7617)

Prints to a file, with increasing id.
TODO: use actual delegate instance id in the filename. Take filepath from
compile_spec.
---
 .../serialization/xnnpack_graph_serialize.py  | 38 ++++++++++++++++---
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/backends/xnnpack/serialization/xnnpack_graph_serialize.py b/backends/xnnpack/serialization/xnnpack_graph_serialize.py
index 160c926780..0fbd0ddc5e 100644
--- a/backends/xnnpack/serialization/xnnpack_graph_serialize.py
+++ b/backends/xnnpack/serialization/xnnpack_graph_serialize.py
@@ -5,11 +5,13 @@
 # LICENSE file in the root directory of this source tree.
 
 import json
+
+import logging
 import os
 import tempfile
 
 from dataclasses import dataclass, fields, is_dataclass
-from typing import ClassVar, Literal
+from typing import ClassVar, Literal, Optional
 
 import pkg_resources
 from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import XNNGraph
@@ -17,6 +19,9 @@
 
 from executorch.exir._serialize._flatbuffer import _flatc_compile
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
 # Byte order of numbers written to program headers. Always little-endian
 # regardless of the host system, since all commonly-used modern CPUs are little
 # endian.
@@ -273,19 +278,42 @@ def _pad_to(data: bytes, length: int) -> bytes:
     return data
 
 
-def pretty_print_xnngraph(xnnpack_graph_json: str):
+def pretty_print_xnngraph(xnnpack_graph_json: str, filename: Optional[str] = None):
     """
-    Pretty print the XNNGraph
+    Pretty print the XNNGraph, optionally writing to a file if filename is provided
     """
-    from pprint import pprint
+    from pprint import pformat
 
     d = json.loads(xnnpack_graph_json)
-    pprint(d)
+    pstr = pformat(d, indent=2, compact=True).replace("'", '"')
+    if filename:
+        with open(filename, "w") as f:
+            if filename.endswith(".json"):
+                pstr = pstr.replace("None", "null")
+            f.write(pstr)
+    else:  # dump to stdout
+        print("XNNGraph:")
+        print(pstr)
+        print("End of XNNGraph")
+
+
+# TODO: Replace this with an actual delegate id
+_delegate_instance_id = 0
 
 
 def convert_to_flatbuffer(xnnpack_graph: XNNGraph) -> bytes:
+    global _delegate_instance_id
     sanity_check_xnngraph_dataclass(xnnpack_graph)
     xnnpack_graph_json = json.dumps(xnnpack_graph, cls=_DataclassEncoder)
+
+    # Log the XNNGraph if debugging
+    if logger.getEffectiveLevel() == logging.DEBUG:
+        filename: str = f"./xnnpack_delegate_graph_{_delegate_instance_id}.json"
+        logger.debug(f"Writing XNNGraph to {filename}")
+        pretty_print_xnngraph(xnnpack_graph_json, filename)
+
+    _delegate_instance_id += 1
+
     with tempfile.TemporaryDirectory() as d:
         schema_path = os.path.join(d, "schema.fbs")
         with open(schema_path, "wb") as schema_file:

From 6aa5c8a1a473197f403c0b7a8804c39f4126face Mon Sep 17 00:00:00 2001
From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com>
Date: Wed, 15 Jan 2025 23:40:52 -0800
Subject: [PATCH 15/40] Add uint16 to supported dtypes and regenerate edge.yaml

Differential Revision: D68241997

Pull Request resolved: https://github.com/pytorch/executorch/pull/7687
---
 exir/dialects/edge/dtype/runner.py        |    4 +-
 exir/dialects/edge/edge.yaml              | 1308 +++++++++++++--------
 exir/dialects/edge/test/test_edge_yaml.py |    1 +
 3 files changed, 832 insertions(+), 481 deletions(-)

diff --git a/exir/dialects/edge/dtype/runner.py b/exir/dialects/edge/dtype/runner.py
index ef488433fb..67982a164e 100644
--- a/exir/dialects/edge/dtype/runner.py
+++ b/exir/dialects/edge/dtype/runner.py
@@ -17,7 +17,9 @@
 
 class DtypeRunner:
     def __init__(self):
-        self.tensor_dtypes = list(common_dtype.all_types_and(torch.bool, torch.half))
+        self.tensor_dtypes = list(
+            common_dtype.all_types_and(torch.bool, torch.half, torch.uint16)
+        )
         self.scalar_dtypes = [torch.bool, torch.int, torch.float]
 
     @staticmethod
diff --git a/exir/dialects/edge/edge.yaml b/exir/dialects/edge/edge.yaml
index 3d052fc944..039490a839 100644
--- a/exir/dialects/edge/edge.yaml
+++ b/exir/dialects/edge/edge.yaml
@@ -26,7 +26,7 @@
   inherits: aten::_to_copy
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Byte]
     T3: [Char]
     T4: [Double]
@@ -35,6 +35,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T1
     dtype: T0
@@ -63,6 +64,9 @@
   - self: T1
     dtype: T9
     __ret_0: T9
+  - self: T1
+    dtype: T10
+    __ret_0: T10
 
 - func: aten::abs
   namespace: edge
@@ -77,7 +81,7 @@
   namespace: edge
   inherits: aten::acos
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -90,7 +94,7 @@
   namespace: edge
   inherits: aten::acosh
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -104,7 +108,7 @@
   inherits: aten::add.Scalar
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Float, Int, Long, Short]
+    T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T2: [Bool, Float, Int]
     T3: [Bool, Int]
     T4: [Bool, Long]
@@ -117,6 +121,7 @@
     T11: [Int]
     T12: [Long]
     T13: [Short]
+    T14: [UInt16]
   type_constraint:
   - self: T0
     other: T0
@@ -238,6 +243,10 @@
     other: T8
     alpha: T9
     __ret_0: T8
+  - self: T14
+    other: T8
+    alpha: T9
+    __ret_0: T8
 
 - func: aten::add.Tensor
   namespace: edge
@@ -245,9 +254,9 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte]
-    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T3: [Bool, Byte, Char, Float, Half, Int, Long, Short]
-    T4: [Bool, Byte, Char, Half, Int, Long, Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T4: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
     T5: [Bool, Byte, Char, Int, Long, Short]
     T6: [Bool, Byte, Char, Int, Short]
     T7: [Bool, Byte, Char, Short]
@@ -264,6 +273,7 @@
     T18: [Int]
     T19: [Long]
     T20: [Short]
+    T21: [UInt16]
   type_constraint:
   - self: T0
     other: T0
@@ -417,6 +427,10 @@
     other: T20
     alpha: T16
     __ret_0: T14
+  - self: T14
+    other: T21
+    alpha: T16
+    __ret_0: T14
   - self: T15
     other: T0
     alpha: T16
@@ -461,6 +475,10 @@
     other: T20
     alpha: T16
     __ret_0: T15
+  - self: T15
+    other: T21
+    alpha: T16
+    __ret_0: T15
   - self: T17
     other: T0
     alpha: T16
@@ -505,6 +523,10 @@
     other: T20
     alpha: T16
     __ret_0: T17
+  - self: T17
+    other: T21
+    alpha: T16
+    __ret_0: T17
   - self: T18
     other: T6
     alpha: T18
@@ -553,6 +575,18 @@
     other: T17
     alpha: T16
     __ret_0: T17
+  - self: T21
+    other: T14
+    alpha: T16
+    __ret_0: T14
+  - self: T21
+    other: T15
+    alpha: T16
+    __ret_0: T15
+  - self: T21
+    other: T17
+    alpha: T16
+    __ret_0: T17
 
 - func: aten::addmm
   namespace: edge
@@ -862,7 +896,7 @@
   namespace: edge
   inherits: aten::alias_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -890,7 +924,7 @@
   inherits: aten::any
   type_alias:
     T0: [Bool]
-    T1: [Bool, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Byte]
   type_constraint:
   - self: T1
@@ -2057,7 +2091,7 @@
   namespace: edge
   inherits: aten::as_strided_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -2066,7 +2100,7 @@
   namespace: edge
   inherits: aten::asin
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -2079,7 +2113,7 @@
   namespace: edge
   inherits: aten::asinh
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -2092,7 +2126,7 @@
   namespace: edge
   inherits: aten::atan
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -2105,7 +2139,7 @@
   namespace: edge
   inherits: aten::atanh
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -2135,6 +2169,7 @@
     T5: [Int]
     T6: [Long]
     T7: [Short]
+    T8: [UInt16]
   type_constraint:
   - self: T0
     other: T0
@@ -2157,17 +2192,20 @@
   - self: T7
     other: T1
     __ret_0: T7
+  - self: T8
+    other: T1
+    __ret_0: T8
 
 - func: aten::bitwise_and.Tensor
   namespace: edge
   inherits: aten::bitwise_and.Tensor
   type_alias:
-    T0: [Bool]
-    T1: [Bool, Byte]
-    T2: [Bool, Byte, Char, Int, Long, Short]
-    T3: [Bool, Byte, Char, Int, Short]
-    T4: [Bool, Byte, Char, Short]
-    T5: [Bool, Char]
+    T0: [Bool, Byte]
+    T1: [Bool, Byte, Char, Int, Long, Short]
+    T2: [Bool, Byte, Char, Int, Short]
+    T3: [Bool, Byte, Char, Short]
+    T4: [Bool, Char]
+    T5: [Bool, UInt16]
     T6: [Byte]
     T7: [Byte, Short]
     T8: [Char]
@@ -2177,25 +2215,25 @@
     T12: [Short]
   type_constraint:
   - self: T0
-    other: T0
-    __ret_0: T0
-  - self: T1
     other: T6
     __ret_0: T6
-  - self: T2
+  - self: T1
     other: T11
     __ret_0: T11
-  - self: T3
+  - self: T2
     other: T10
     __ret_0: T10
-  - self: T4
+  - self: T3
     other: T12
     __ret_0: T12
-  - self: T5
+  - self: T4
     other: T8
     __ret_0: T8
+  - self: T5
+    other: T5
+    __ret_0: T5
   - self: T6
-    other: T1
+    other: T0
     __ret_0: T6
   - self: T6
     other: T9
@@ -2204,7 +2242,7 @@
     other: T8
     __ret_0: T12
   - self: T8
-    other: T5
+    other: T4
     __ret_0: T8
   - self: T8
     other: T7
@@ -2213,13 +2251,13 @@
     other: T6
     __ret_0: T12
   - self: T10
-    other: T3
+    other: T2
     __ret_0: T10
   - self: T11
-    other: T2
+    other: T1
     __ret_0: T11
   - self: T12
-    other: T4
+    other: T3
     __ret_0: T12
 
 - func: aten::bitwise_not
@@ -2243,6 +2281,7 @@
     T5: [Int]
     T6: [Long]
     T7: [Short]
+    T8: [UInt16]
   type_constraint:
   - self: T0
     other: T0
@@ -2265,17 +2304,20 @@
   - self: T7
     other: T1
     __ret_0: T7
+  - self: T8
+    other: T1
+    __ret_0: T8
 
 - func: aten::bitwise_or.Tensor
   namespace: edge
   inherits: aten::bitwise_or.Tensor
   type_alias:
-    T0: [Bool]
-    T1: [Bool, Byte]
-    T2: [Bool, Byte, Char, Int, Long, Short]
-    T3: [Bool, Byte, Char, Int, Short]
-    T4: [Bool, Byte, Char, Short]
-    T5: [Bool, Char]
+    T0: [Bool, Byte]
+    T1: [Bool, Byte, Char, Int, Long, Short]
+    T2: [Bool, Byte, Char, Int, Short]
+    T3: [Bool, Byte, Char, Short]
+    T4: [Bool, Char]
+    T5: [Bool, UInt16]
     T6: [Byte]
     T7: [Byte, Short]
     T8: [Char]
@@ -2285,25 +2327,25 @@
     T12: [Short]
   type_constraint:
   - self: T0
-    other: T0
-    __ret_0: T0
-  - self: T1
     other: T6
     __ret_0: T6
-  - self: T2
+  - self: T1
     other: T11
     __ret_0: T11
-  - self: T3
+  - self: T2
     other: T10
     __ret_0: T10
-  - self: T4
+  - self: T3
     other: T12
     __ret_0: T12
-  - self: T5
+  - self: T4
     other: T8
     __ret_0: T8
+  - self: T5
+    other: T5
+    __ret_0: T5
   - self: T6
-    other: T1
+    other: T0
     __ret_0: T6
   - self: T6
     other: T9
@@ -2312,7 +2354,7 @@
     other: T8
     __ret_0: T12
   - self: T8
-    other: T5
+    other: T4
     __ret_0: T8
   - self: T8
     other: T7
@@ -2321,13 +2363,13 @@
     other: T6
     __ret_0: T12
   - self: T10
-    other: T3
+    other: T2
     __ret_0: T10
   - self: T11
-    other: T2
+    other: T1
     __ret_0: T11
   - self: T12
-    other: T4
+    other: T3
     __ret_0: T12
 
 - func: aten::bitwise_xor.Scalar
@@ -2342,6 +2384,7 @@
     T5: [Int]
     T6: [Long]
     T7: [Short]
+    T8: [UInt16]
   type_constraint:
   - self: T0
     other: T0
@@ -2364,17 +2407,20 @@
   - self: T7
     other: T1
     __ret_0: T7
+  - self: T8
+    other: T1
+    __ret_0: T8
 
 - func: aten::bitwise_xor.Tensor
   namespace: edge
   inherits: aten::bitwise_xor.Tensor
   type_alias:
-    T0: [Bool]
-    T1: [Bool, Byte]
-    T2: [Bool, Byte, Char, Int, Long, Short]
-    T3: [Bool, Byte, Char, Int, Short]
-    T4: [Bool, Byte, Char, Short]
-    T5: [Bool, Char]
+    T0: [Bool, Byte]
+    T1: [Bool, Byte, Char, Int, Long, Short]
+    T2: [Bool, Byte, Char, Int, Short]
+    T3: [Bool, Byte, Char, Short]
+    T4: [Bool, Char]
+    T5: [Bool, UInt16]
     T6: [Byte]
     T7: [Byte, Short]
     T8: [Char]
@@ -2384,25 +2430,25 @@
     T12: [Short]
   type_constraint:
   - self: T0
-    other: T0
-    __ret_0: T0
-  - self: T1
     other: T6
     __ret_0: T6
-  - self: T2
+  - self: T1
     other: T11
     __ret_0: T11
-  - self: T3
+  - self: T2
     other: T10
     __ret_0: T10
-  - self: T4
+  - self: T3
     other: T12
     __ret_0: T12
-  - self: T5
+  - self: T4
     other: T8
     __ret_0: T8
+  - self: T5
+    other: T5
+    __ret_0: T5
   - self: T6
-    other: T1
+    other: T0
     __ret_0: T6
   - self: T6
     other: T9
@@ -2411,7 +2457,7 @@
     other: T8
     __ret_0: T12
   - self: T8
-    other: T5
+    other: T4
     __ret_0: T8
   - self: T8
     other: T7
@@ -2420,13 +2466,13 @@
     other: T6
     __ret_0: T12
   - self: T10
-    other: T3
+    other: T2
     __ret_0: T10
   - self: T11
-    other: T2
+    other: T1
     __ret_0: T11
   - self: T12
-    other: T4
+    other: T3
     __ret_0: T12
 
 - func: aten::bmm
@@ -2443,7 +2489,7 @@
   namespace: edge
   inherits: aten::cat
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - tensors: T0
     __ret_0: T0
@@ -2452,7 +2498,7 @@
   namespace: edge
   inherits: aten::ceil
   type_alias:
-    T0: [Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -2462,7 +2508,7 @@
   inherits: aten::clamp
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Float, Int, Long, Short]
+    T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T2: [Bool, Float, Int]
     T3: [Bool, Int]
     T4: [Bool, Long]
@@ -2474,6 +2520,7 @@
     T10: [Int]
     T11: [Long]
     T12: [Short]
+    T13: [UInt16]
   type_constraint:
   - self: T0
     min: T2
@@ -2715,12 +2762,20 @@
     min: T10
     max: T3
     __ret_0: T12
+  - self: T13
+    min: T2
+    max: T8
+    __ret_0: T8
+  - self: T13
+    min: T8
+    max: T2
+    __ret_0: T8
 
 - func: aten::clone
   namespace: edge
   inherits: aten::clone
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -2739,6 +2794,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T0
     value: T1
@@ -2767,12 +2823,15 @@
   - self: T9
     value: T1
     __ret_0: T9
+  - self: T10
+    value: T1
+    __ret_0: T10
 
 - func: aten::convolution
   namespace: edge
   inherits: aten::convolution
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T1: [Byte]
     T2: [Char]
     T3: [Double]
@@ -2820,7 +2879,7 @@
   inherits: aten::copy
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Byte]
     T3: [Char]
     T4: [Double]
@@ -2829,6 +2888,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T0
     src: T1
@@ -2857,12 +2917,15 @@
   - self: T9
     src: T1
     __ret_0: T9
+  - self: T10
+    src: T1
+    __ret_0: T10
 
 - func: aten::cos
   namespace: edge
   inherits: aten::cos
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -2875,7 +2938,7 @@
   namespace: edge
   inherits: aten::cosh
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -2888,7 +2951,7 @@
   namespace: edge
   inherits: aten::cumsum
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T1: [Byte]
     T2: [Char]
     T3: [Double]
@@ -2927,7 +2990,7 @@
   namespace: edge
   inherits: aten::detach_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -2937,7 +3000,7 @@
   inherits: aten::div.Scalar
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Float, Int, Long, Short]
+    T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T2: [Bool, Float, Int]
     T3: [Byte]
     T4: [Char]
@@ -2947,6 +3010,7 @@
     T8: [Int]
     T9: [Long]
     T10: [Short]
+    T11: [UInt16]
   type_constraint:
   - self: T0
     other: T2
@@ -2984,24 +3048,29 @@
   - self: T10
     other: T2
     __ret_0: T6
+  - self: T11
+    other: T2
+    __ret_0: T6
 
 - func: aten::div.Tensor
   namespace: edge
   inherits: aten::div.Tensor
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
     T3: [Bool, Byte, Char, Float, Int, Long, Short]
-    T4: [Bool, Byte, Char, Half, Int, Long, Short]
+    T4: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
     T5: [Byte]
     T6: [Char]
     T7: [Double]
     T8: [Float]
-    T9: [Half]
-    T10: [Int]
-    T11: [Long]
-    T12: [Short]
+    T9: [Float, UInt16]
+    T10: [Half]
+    T11: [Int]
+    T12: [Long]
+    T13: [Short]
+    T14: [UInt16]
   type_constraint:
   - self: T0
     other: T3
@@ -3021,18 +3090,18 @@
   - self: T3
     other: T6
     __ret_0: T8
-  - self: T3
-    other: T10
-    __ret_0: T8
   - self: T3
     other: T11
     __ret_0: T8
   - self: T3
     other: T12
     __ret_0: T8
+  - self: T3
+    other: T13
+    __ret_0: T8
   - self: T4
-    other: T9
-    __ret_0: T9
+    other: T10
+    __ret_0: T10
   - self: T5
     other: T3
     __ret_0: T8
@@ -3046,26 +3115,32 @@
     other: T2
     __ret_0: T8
   - self: T9
-    other: T4
-    __ret_0: T9
-  - self: T10
-    other: T3
+    other: T14
     __ret_0: T8
+  - self: T10
+    other: T4
+    __ret_0: T10
   - self: T11
     other: T3
     __ret_0: T8
   - self: T12
     other: T3
     __ret_0: T8
+  - self: T13
+    other: T3
+    __ret_0: T8
+  - self: T14
+    other: T9
+    __ret_0: T8
 
 - func: aten::div.Tensor_mode
   namespace: edge
   inherits: aten::div.Tensor_mode
   type_alias:
     T0: [Bool, Byte]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short]
-    T3: [Bool, Byte, Char, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
     T4: [Bool, Byte, Char, Int, Long, Short]
     T5: [Bool, Byte, Char, Int, Short]
     T6: [Bool, Byte, Char, Short]
@@ -3156,6 +3231,7 @@
     T7: [Int, Long]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - weight: T0
     indices: T7
@@ -3184,12 +3260,15 @@
   - weight: T9
     indices: T7
     __ret_0: T9
+  - weight: T10
+    indices: T7
+    __ret_0: T10
 
 - func: aten::empty.memory_format
   namespace: edge
   inherits: aten::empty.memory_format
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - dtype: T0
     __ret_0: T0
@@ -3199,7 +3278,7 @@
   inherits: aten::eq.Scalar
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Bool, Float, Int]
     T3: [Byte]
     T4: [Char]
@@ -3209,6 +3288,7 @@
     T8: [Int]
     T9: [Long]
     T10: [Short]
+    T11: [UInt16]
   type_constraint:
   - self: T0
     other: T2
@@ -3246,12 +3326,15 @@
   - self: T10
     other: T2
     __ret_0: T0
+  - self: T11
+    other: T2
+    __ret_0: T0
 
 - func: aten::erf
   namespace: edge
   inherits: aten::erf
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -3264,7 +3347,7 @@
   namespace: edge
   inherits: aten::exp
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -3277,7 +3360,7 @@
   namespace: edge
   inherits: aten::expand_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -3296,6 +3379,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T0
     value: T1
@@ -3324,13 +3408,16 @@
   - self: T9
     value: T1
     __ret_0: T9
+  - self: T10
+    value: T1
+    __ret_0: T10
 
 - func: aten::fill.Tensor
   namespace: edge
   inherits: aten::fill.Tensor
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Byte]
     T3: [Char]
     T4: [Double]
@@ -3339,6 +3426,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T0
     value: T1
@@ -3367,12 +3455,15 @@
   - self: T9
     value: T1
     __ret_0: T9
+  - self: T10
+    value: T1
+    __ret_0: T10
 
 - func: aten::floor
   namespace: edge
   inherits: aten::floor
   type_alias:
-    T0: [Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -3382,9 +3473,9 @@
   inherits: aten::floor_divide
   type_alias:
     T0: [Bool, Byte]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short]
-    T3: [Bool, Byte, Char, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
     T4: [Bool, Byte, Char, Int, Long, Short]
     T5: [Bool, Byte, Char, Int, Short]
     T6: [Bool, Byte, Char, Short]
@@ -3465,7 +3556,7 @@
   namespace: edge
   inherits: aten::fmod.Scalar
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Bool, Float, Int]
     T2: [Bool, Int]
     T3: [Bool, Long]
@@ -3514,9 +3605,9 @@
   inherits: aten::fmod.Tensor
   type_alias:
     T0: [Bool, Byte]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short]
-    T3: [Bool, Byte, Char, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
     T4: [Bool, Byte, Char, Int, Long, Short]
     T5: [Bool, Byte, Char, Int, Short]
     T6: [Bool, Byte, Char, Short]
@@ -3607,6 +3698,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - fill_value: T1
     dtype: T0
@@ -3635,13 +3727,16 @@
   - fill_value: T1
     dtype: T9
     __ret_0: T9
+  - fill_value: T1
+    dtype: T10
+    __ret_0: T10
 
 - func: aten::full_like
   namespace: edge
   inherits: aten::full_like
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Bool, Float, Int]
     T3: [Byte]
     T4: [Char]
@@ -3651,6 +3746,7 @@
     T8: [Int]
     T9: [Long]
     T10: [Short]
+    T11: [UInt16]
   type_constraint:
   - self: T0
     fill_value: T2
@@ -3688,6 +3784,10 @@
     fill_value: T2
     dtype: T10
     __ret_0: T10
+  - self: T0
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
   - self: T1
     fill_value: T0
     dtype: T0
@@ -3724,6 +3824,10 @@
     fill_value: T0
     dtype: T10
     __ret_0: T10
+  - self: T1
+    fill_value: T0
+    dtype: T11
+    __ret_0: T11
   - self: T1
     fill_value: T6
     dtype: T0
@@ -3760,6 +3864,10 @@
     fill_value: T6
     dtype: T10
     __ret_0: T10
+  - self: T1
+    fill_value: T6
+    dtype: T11
+    __ret_0: T11
   - self: T1
     fill_value: T8
     dtype: T0
@@ -3796,6 +3904,10 @@
     fill_value: T8
     dtype: T10
     __ret_0: T10
+  - self: T1
+    fill_value: T8
+    dtype: T11
+    __ret_0: T11
   - self: T3
     fill_value: T2
     dtype: T0
@@ -3832,6 +3944,10 @@
     fill_value: T2
     dtype: T10
     __ret_0: T10
+  - self: T3
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
   - self: T4
     fill_value: T2
     dtype: T0
@@ -3868,6 +3984,10 @@
     fill_value: T2
     dtype: T10
     __ret_0: T10
+  - self: T4
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
   - self: T5
     fill_value: T2
     dtype: T0
@@ -3904,6 +4024,10 @@
     fill_value: T2
     dtype: T10
     __ret_0: T10
+  - self: T5
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
   - self: T6
     fill_value: T2
     dtype: T0
@@ -3940,6 +4064,10 @@
     fill_value: T2
     dtype: T10
     __ret_0: T10
+  - self: T6
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
   - self: T7
     fill_value: T2
     dtype: T0
@@ -3976,6 +4104,10 @@
     fill_value: T2
     dtype: T10
     __ret_0: T10
+  - self: T7
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
   - self: T8
     fill_value: T2
     dtype: T0
@@ -4012,6 +4144,10 @@
     fill_value: T2
     dtype: T10
     __ret_0: T10
+  - self: T8
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
   - self: T9
     fill_value: T2
     dtype: T0
@@ -4048,6 +4184,10 @@
     fill_value: T2
     dtype: T10
     __ret_0: T10
+  - self: T9
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
   - self: T10
     fill_value: T2
     dtype: T0
@@ -4084,58 +4224,103 @@
     fill_value: T2
     dtype: T10
     __ret_0: T10
-
-- func: aten::ge.Scalar
-  namespace: edge
-  inherits: aten::ge.Scalar
-  type_alias:
-    T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Float, Int]
-    T3: [Byte]
-    T4: [Char]
-    T5: [Double]
-    T6: [Float]
-    T7: [Half]
-    T8: [Int]
-    T9: [Long]
-    T10: [Short]
-  type_constraint:
-  - self: T0
-    other: T2
+  - self: T10
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
+  - self: T11
+    fill_value: T2
+    dtype: T0
     __ret_0: T0
-  - self: T1
-    other: T0
+  - self: T11
+    fill_value: T2
+    dtype: T3
+    __ret_0: T3
+  - self: T11
+    fill_value: T2
+    dtype: T4
+    __ret_0: T4
+  - self: T11
+    fill_value: T2
+    dtype: T5
+    __ret_0: T5
+  - self: T11
+    fill_value: T2
+    dtype: T6
+    __ret_0: T6
+  - self: T11
+    fill_value: T2
+    dtype: T7
+    __ret_0: T7
+  - self: T11
+    fill_value: T2
+    dtype: T8
+    __ret_0: T8
+  - self: T11
+    fill_value: T2
+    dtype: T9
+    __ret_0: T9
+  - self: T11
+    fill_value: T2
+    dtype: T10
+    __ret_0: T10
+  - self: T11
+    fill_value: T2
+    dtype: T11
+    __ret_0: T11
+
+- func: aten::ge.Scalar
+  namespace: edge
+  inherits: aten::ge.Scalar
+  type_alias:
+    T0: [Bool]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Float, Int]
+    T4: [Byte]
+    T5: [Char]
+    T6: [Double]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
+  type_constraint:
+  - self: T0
+    other: T3
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T0
     __ret_0: T0
   - self: T1
-    other: T8
+    other: T9
     __ret_0: T0
-  - self: T3
-    other: T2
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T4
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T5
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T6
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T7
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T8
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T9
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T10
-    other: T2
+    other: T3
+    __ret_0: T0
+  - self: T11
+    other: T3
     __ret_0: T0
 
 - func: aten::ge.Tensor
@@ -4144,14 +4329,17 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Byte]
-    T3: [Char]
-    T4: [Double]
-    T5: [Float]
-    T6: [Half]
-    T7: [Int]
-    T8: [Long]
-    T9: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Byte]
+    T4: [Char]
+    T5: [Double]
+    T6: [Double, Float, Half]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
+    T12: [UInt16]
   type_constraint:
   - self: T0
     other: T1
@@ -4159,9 +4347,6 @@
   - self: T1
     other: T0
     __ret_0: T0
-  - self: T1
-    other: T2
-    __ret_0: T0
   - self: T1
     other: T3
     __ret_0: T0
@@ -4169,22 +4354,22 @@
     other: T4
     __ret_0: T0
   - self: T1
-    other: T5
+    other: T9
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T10
     __ret_0: T0
   - self: T1
-    other: T7
+    other: T11
     __ret_0: T0
-  - self: T1
-    other: T8
+  - self: T2
+    other: T5
     __ret_0: T0
-  - self: T1
-    other: T9
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T2
-    other: T1
+    other: T8
     __ret_0: T0
   - self: T3
     other: T1
@@ -4193,20 +4378,29 @@
     other: T1
     __ret_0: T0
   - self: T5
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T6
-    other: T1
+    other: T12
     __ret_0: T0
   - self: T7
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T8
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T9
     other: T1
     __ret_0: T0
+  - self: T10
+    other: T1
+    __ret_0: T0
+  - self: T11
+    other: T1
+    __ret_0: T0
+  - self: T12
+    other: T6
+    __ret_0: T0
 
 - func: aten::gelu
   namespace: edge
@@ -4232,51 +4426,52 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Float, Int]
-    T3: [Byte]
-    T4: [Char]
-    T5: [Double]
-    T6: [Float]
-    T7: [Half]
-    T8: [Int]
-    T9: [Long]
-    T10: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Float, Int]
+    T4: [Byte]
+    T5: [Char]
+    T6: [Double]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
   type_constraint:
   - self: T0
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T1
     other: T0
     __ret_0: T0
   - self: T1
-    other: T6
-    __ret_0: T0
-  - self: T1
-    other: T8
+    other: T9
     __ret_0: T0
-  - self: T3
-    other: T2
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T4
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T5
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T6
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T7
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T8
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T9
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T10
-    other: T2
+    other: T3
+    __ret_0: T0
+  - self: T11
+    other: T3
     __ret_0: T0
 
 - func: aten::gt.Tensor
@@ -4285,14 +4480,17 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Byte]
-    T3: [Char]
-    T4: [Double]
-    T5: [Float]
-    T6: [Half]
-    T7: [Int]
-    T8: [Long]
-    T9: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Byte]
+    T4: [Char]
+    T5: [Double]
+    T6: [Double, Float, Half]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
+    T12: [UInt16]
   type_constraint:
   - self: T0
     other: T1
@@ -4300,9 +4498,6 @@
   - self: T1
     other: T0
     __ret_0: T0
-  - self: T1
-    other: T2
-    __ret_0: T0
   - self: T1
     other: T3
     __ret_0: T0
@@ -4310,22 +4505,22 @@
     other: T4
     __ret_0: T0
   - self: T1
-    other: T5
+    other: T9
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T10
     __ret_0: T0
   - self: T1
-    other: T7
+    other: T11
     __ret_0: T0
-  - self: T1
-    other: T8
+  - self: T2
+    other: T5
     __ret_0: T0
-  - self: T1
-    other: T9
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T2
-    other: T1
+    other: T8
     __ret_0: T0
   - self: T3
     other: T1
@@ -4334,20 +4529,29 @@
     other: T1
     __ret_0: T0
   - self: T5
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T6
-    other: T1
+    other: T12
     __ret_0: T0
   - self: T7
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T8
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T9
     other: T1
     __ret_0: T0
+  - self: T10
+    other: T1
+    __ret_0: T0
+  - self: T11
+    other: T1
+    __ret_0: T0
+  - self: T12
+    other: T6
+    __ret_0: T0
 
 - func: aten::hardtanh
   namespace: edge
@@ -4666,6 +4870,7 @@
     T7: [Int, Long]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T0
     index: T7
@@ -4694,13 +4899,16 @@
   - self: T9
     index: T7
     __ret_0: T9
+  - self: T10
+    index: T7
+    __ret_0: T10
 
 - func: aten::isinf
   namespace: edge
   inherits: aten::isinf
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T1
     __ret_0: T0
@@ -4710,7 +4918,7 @@
   inherits: aten::isnan
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T1
     __ret_0: T0
@@ -4721,51 +4929,52 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Float, Int]
-    T3: [Byte]
-    T4: [Char]
-    T5: [Double]
-    T6: [Float]
-    T7: [Half]
-    T8: [Int]
-    T9: [Long]
-    T10: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Float, Int]
+    T4: [Byte]
+    T5: [Char]
+    T6: [Double]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
   type_constraint:
   - self: T0
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T1
     other: T0
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T9
     __ret_0: T0
-  - self: T1
-    other: T8
-    __ret_0: T0
-  - self: T3
-    other: T2
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T4
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T5
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T6
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T7
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T8
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T9
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T10
-    other: T2
+    other: T3
+    __ret_0: T0
+  - self: T11
+    other: T3
     __ret_0: T0
 
 - func: aten::le.Tensor
@@ -4774,14 +4983,17 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Byte]
-    T3: [Char]
-    T4: [Double]
-    T5: [Float]
-    T6: [Half]
-    T7: [Int]
-    T8: [Long]
-    T9: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Byte]
+    T4: [Char]
+    T5: [Double]
+    T6: [Double, Float, Half]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
+    T12: [UInt16]
   type_constraint:
   - self: T0
     other: T1
@@ -4789,9 +5001,6 @@
   - self: T1
     other: T0
     __ret_0: T0
-  - self: T1
-    other: T2
-    __ret_0: T0
   - self: T1
     other: T3
     __ret_0: T0
@@ -4799,22 +5008,22 @@
     other: T4
     __ret_0: T0
   - self: T1
-    other: T5
+    other: T9
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T10
     __ret_0: T0
   - self: T1
-    other: T7
+    other: T11
     __ret_0: T0
-  - self: T1
-    other: T8
+  - self: T2
+    other: T5
     __ret_0: T0
-  - self: T1
-    other: T9
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T2
-    other: T1
+    other: T8
     __ret_0: T0
   - self: T3
     other: T1
@@ -4823,20 +5032,29 @@
     other: T1
     __ret_0: T0
   - self: T5
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T6
-    other: T1
+    other: T12
     __ret_0: T0
   - self: T7
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T8
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T9
     other: T1
     __ret_0: T0
+  - self: T10
+    other: T1
+    __ret_0: T0
+  - self: T11
+    other: T1
+    __ret_0: T0
+  - self: T12
+    other: T6
+    __ret_0: T0
 
 - func: aten::leaky_relu
   namespace: edge
@@ -4861,7 +5079,7 @@
   namespace: edge
   inherits: aten::lift_fresh_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -4870,7 +5088,7 @@
   namespace: edge
   inherits: aten::log
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -4885,14 +5103,17 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Byte]
-    T3: [Char]
-    T4: [Double]
-    T5: [Float]
-    T6: [Half]
-    T7: [Int]
-    T8: [Long]
-    T9: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Byte]
+    T4: [Char]
+    T5: [Double]
+    T6: [Double, Float, Half]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
+    T12: [UInt16]
   type_constraint:
   - self: T0
     other: T1
@@ -4900,9 +5121,6 @@
   - self: T1
     other: T0
     __ret_0: T0
-  - self: T1
-    other: T2
-    __ret_0: T0
   - self: T1
     other: T3
     __ret_0: T0
@@ -4910,22 +5128,22 @@
     other: T4
     __ret_0: T0
   - self: T1
-    other: T5
+    other: T9
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T10
     __ret_0: T0
   - self: T1
-    other: T7
+    other: T11
     __ret_0: T0
-  - self: T1
-    other: T8
+  - self: T2
+    other: T5
     __ret_0: T0
-  - self: T1
-    other: T9
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T2
-    other: T1
+    other: T8
     __ret_0: T0
   - self: T3
     other: T1
@@ -4934,20 +5152,29 @@
     other: T1
     __ret_0: T0
   - self: T5
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T6
-    other: T1
+    other: T12
     __ret_0: T0
   - self: T7
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T8
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T9
     other: T1
     __ret_0: T0
+  - self: T10
+    other: T1
+    __ret_0: T0
+  - self: T11
+    other: T1
+    __ret_0: T0
+  - self: T12
+    other: T6
+    __ret_0: T0
 
 - func: aten::logical_not
   namespace: edge
@@ -4965,14 +5192,17 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Byte]
-    T3: [Char]
-    T4: [Double]
-    T5: [Float]
-    T6: [Half]
-    T7: [Int]
-    T8: [Long]
-    T9: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Byte]
+    T4: [Char]
+    T5: [Double]
+    T6: [Double, Float, Half]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
+    T12: [UInt16]
   type_constraint:
   - self: T0
     other: T1
@@ -4980,9 +5210,6 @@
   - self: T1
     other: T0
     __ret_0: T0
-  - self: T1
-    other: T2
-    __ret_0: T0
   - self: T1
     other: T3
     __ret_0: T0
@@ -4990,22 +5217,22 @@
     other: T4
     __ret_0: T0
   - self: T1
-    other: T5
+    other: T9
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T10
     __ret_0: T0
   - self: T1
-    other: T7
+    other: T11
     __ret_0: T0
-  - self: T1
-    other: T8
+  - self: T2
+    other: T5
     __ret_0: T0
-  - self: T1
-    other: T9
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T2
-    other: T1
+    other: T8
     __ret_0: T0
   - self: T3
     other: T1
@@ -5014,20 +5241,29 @@
     other: T1
     __ret_0: T0
   - self: T5
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T6
-    other: T1
+    other: T12
     __ret_0: T0
   - self: T7
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T8
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T9
     other: T1
     __ret_0: T0
+  - self: T10
+    other: T1
+    __ret_0: T0
+  - self: T11
+    other: T1
+    __ret_0: T0
+  - self: T12
+    other: T6
+    __ret_0: T0
 
 - func: aten::logical_xor
   namespace: edge
@@ -5035,14 +5271,17 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Byte]
-    T3: [Char]
-    T4: [Double]
-    T5: [Float]
-    T6: [Half]
-    T7: [Int]
-    T8: [Long]
-    T9: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Byte]
+    T4: [Char]
+    T5: [Double]
+    T6: [Double, Float, Half]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
+    T12: [UInt16]
   type_constraint:
   - self: T0
     other: T1
@@ -5050,9 +5289,6 @@
   - self: T1
     other: T0
     __ret_0: T0
-  - self: T1
-    other: T2
-    __ret_0: T0
   - self: T1
     other: T3
     __ret_0: T0
@@ -5060,22 +5296,22 @@
     other: T4
     __ret_0: T0
   - self: T1
-    other: T5
+    other: T9
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T10
     __ret_0: T0
   - self: T1
-    other: T7
+    other: T11
     __ret_0: T0
-  - self: T1
-    other: T8
+  - self: T2
+    other: T5
     __ret_0: T0
-  - self: T1
-    other: T9
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T2
-    other: T1
+    other: T8
     __ret_0: T0
   - self: T3
     other: T1
@@ -5084,26 +5320,35 @@
     other: T1
     __ret_0: T0
   - self: T5
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T6
-    other: T1
+    other: T12
     __ret_0: T0
   - self: T7
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T8
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T9
     other: T1
     __ret_0: T0
+  - self: T10
+    other: T1
+    __ret_0: T0
+  - self: T11
+    other: T1
+    __ret_0: T0
+  - self: T12
+    other: T6
+    __ret_0: T0
 
 - func: aten::logit
   namespace: edge
   inherits: aten::logit
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -5118,51 +5363,52 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Float, Int]
-    T3: [Byte]
-    T4: [Char]
-    T5: [Double]
-    T6: [Float]
-    T7: [Half]
-    T8: [Int]
-    T9: [Long]
-    T10: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Float, Int]
+    T4: [Byte]
+    T5: [Char]
+    T6: [Double]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
   type_constraint:
   - self: T0
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T1
     other: T0
     __ret_0: T0
   - self: T1
-    other: T6
-    __ret_0: T0
-  - self: T1
-    other: T8
+    other: T9
     __ret_0: T0
-  - self: T3
-    other: T2
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T4
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T5
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T6
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T7
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T8
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T9
-    other: T2
+    other: T3
     __ret_0: T0
   - self: T10
-    other: T2
+    other: T3
+    __ret_0: T0
+  - self: T11
+    other: T3
     __ret_0: T0
 
 - func: aten::lt.Tensor
@@ -5171,14 +5417,17 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Byte]
-    T3: [Char]
-    T4: [Double]
-    T5: [Float]
-    T6: [Half]
-    T7: [Int]
-    T8: [Long]
-    T9: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Byte]
+    T4: [Char]
+    T5: [Double]
+    T6: [Double, Float, Half]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
+    T12: [UInt16]
   type_constraint:
   - self: T0
     other: T1
@@ -5186,9 +5435,6 @@
   - self: T1
     other: T0
     __ret_0: T0
-  - self: T1
-    other: T2
-    __ret_0: T0
   - self: T1
     other: T3
     __ret_0: T0
@@ -5196,22 +5442,22 @@
     other: T4
     __ret_0: T0
   - self: T1
-    other: T5
+    other: T9
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T10
     __ret_0: T0
   - self: T1
-    other: T7
+    other: T11
     __ret_0: T0
-  - self: T1
-    other: T8
+  - self: T2
+    other: T5
     __ret_0: T0
-  - self: T1
-    other: T9
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T2
-    other: T1
+    other: T8
     __ret_0: T0
   - self: T3
     other: T1
@@ -5220,20 +5466,29 @@
     other: T1
     __ret_0: T0
   - self: T5
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T6
-    other: T1
+    other: T12
     __ret_0: T0
   - self: T7
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T8
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T9
     other: T1
     __ret_0: T0
+  - self: T10
+    other: T1
+    __ret_0: T0
+  - self: T11
+    other: T1
+    __ret_0: T0
+  - self: T12
+    other: T6
+    __ret_0: T0
 
 - func: aten::masked_fill.Scalar
   namespace: edge
@@ -5371,7 +5626,7 @@
   namespace: edge
   inherits: aten::mean.dim
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T1: [Double]
     T2: [Float]
     T3: [Half]
@@ -5434,9 +5689,9 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte]
-    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T3: [Bool, Byte, Char, Float, Half, Int, Long, Short]
-    T4: [Bool, Byte, Char, Half, Int, Long, Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T4: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
     T5: [Bool, Byte, Char, Int, Long, Short]
     T6: [Bool, Byte, Char, Int, Short]
     T7: [Bool, Byte, Char, Short]
@@ -5531,7 +5786,7 @@
   inherits: aten::mul.Scalar
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Float, Int, Long, Short]
+    T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T2: [Bool, Float, Int]
     T3: [Bool, Int]
     T4: [Bool, Long]
@@ -5543,6 +5798,7 @@
     T10: [Int]
     T11: [Long]
     T12: [Short]
+    T13: [UInt16]
   type_constraint:
   - self: T0
     other: T0
@@ -5577,20 +5833,23 @@
   - self: T12
     other: T3
     __ret_0: T12
+  - self: T13
+    other: T3
+    __ret_0: T13
 
 - func: aten::mul.Tensor
   namespace: edge
   inherits: aten::mul.Tensor
   type_alias:
-    T0: [Bool]
-    T1: [Bool, Byte]
-    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T3: [Bool, Byte, Char, Float, Half, Int, Long, Short]
-    T4: [Bool, Byte, Char, Half, Int, Long, Short]
-    T5: [Bool, Byte, Char, Int, Long, Short]
-    T6: [Bool, Byte, Char, Int, Short]
-    T7: [Bool, Byte, Char, Short]
-    T8: [Bool, Char]
+    T0: [Bool, Byte]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
+    T4: [Bool, Byte, Char, Int, Long, Short]
+    T5: [Bool, Byte, Char, Int, Short]
+    T6: [Bool, Byte, Char, Short]
+    T7: [Bool, Char]
+    T8: [Bool, UInt16]
     T9: [Byte]
     T10: [Byte, Short]
     T11: [Char]
@@ -5603,34 +5862,34 @@
     T18: [Short]
   type_constraint:
   - self: T0
-    other: T0
-    __ret_0: T0
-  - self: T1
     other: T9
     __ret_0: T9
-  - self: T2
+  - self: T1
     other: T13
     __ret_0: T13
-  - self: T3
+  - self: T2
     other: T14
     __ret_0: T14
-  - self: T4
+  - self: T3
     other: T15
     __ret_0: T15
-  - self: T5
+  - self: T4
     other: T17
     __ret_0: T17
-  - self: T6
+  - self: T5
     other: T16
     __ret_0: T16
-  - self: T7
+  - self: T6
     other: T18
     __ret_0: T18
-  - self: T8
+  - self: T7
     other: T11
     __ret_0: T11
+  - self: T8
+    other: T8
+    __ret_0: T8
   - self: T9
-    other: T1
+    other: T0
     __ret_0: T9
   - self: T9
     other: T12
@@ -5639,7 +5898,7 @@
     other: T11
     __ret_0: T18
   - self: T11
-    other: T8
+    other: T7
     __ret_0: T11
   - self: T11
     other: T10
@@ -5648,22 +5907,22 @@
     other: T9
     __ret_0: T18
   - self: T13
-    other: T2
+    other: T1
     __ret_0: T13
   - self: T14
-    other: T3
+    other: T2
     __ret_0: T14
   - self: T15
-    other: T4
+    other: T3
     __ret_0: T15
   - self: T16
-    other: T6
+    other: T5
     __ret_0: T16
   - self: T17
-    other: T5
+    other: T4
     __ret_0: T17
   - self: T18
-    other: T7
+    other: T6
     __ret_0: T18
 
 - func: aten::native_layer_norm
@@ -5684,7 +5943,7 @@
   inherits: aten::ne.Scalar
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Bool, Float, Int]
     T3: [Byte]
     T4: [Char]
@@ -5694,6 +5953,7 @@
     T8: [Int]
     T9: [Long]
     T10: [Short]
+    T11: [UInt16]
   type_constraint:
   - self: T0
     other: T2
@@ -5731,6 +5991,9 @@
   - self: T10
     other: T2
     __ret_0: T0
+  - self: T11
+    other: T2
+    __ret_0: T0
 
 - func: aten::ne.Tensor
   namespace: edge
@@ -5738,14 +6001,17 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Byte]
-    T3: [Char]
-    T4: [Double]
-    T5: [Float]
-    T6: [Half]
-    T7: [Int]
-    T8: [Long]
-    T9: [Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Byte]
+    T4: [Char]
+    T5: [Double]
+    T6: [Double, Float, Half, UInt16]
+    T7: [Float]
+    T8: [Half]
+    T9: [Int]
+    T10: [Long]
+    T11: [Short]
+    T12: [UInt16]
   type_constraint:
   - self: T0
     other: T1
@@ -5753,9 +6019,6 @@
   - self: T1
     other: T0
     __ret_0: T0
-  - self: T1
-    other: T2
-    __ret_0: T0
   - self: T1
     other: T3
     __ret_0: T0
@@ -5763,22 +6026,22 @@
     other: T4
     __ret_0: T0
   - self: T1
-    other: T5
+    other: T9
     __ret_0: T0
   - self: T1
-    other: T6
+    other: T10
     __ret_0: T0
   - self: T1
-    other: T7
+    other: T11
     __ret_0: T0
-  - self: T1
-    other: T8
+  - self: T2
+    other: T5
     __ret_0: T0
-  - self: T1
-    other: T9
+  - self: T2
+    other: T7
     __ret_0: T0
   - self: T2
-    other: T1
+    other: T8
     __ret_0: T0
   - self: T3
     other: T1
@@ -5787,20 +6050,29 @@
     other: T1
     __ret_0: T0
   - self: T5
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T6
-    other: T1
+    other: T12
     __ret_0: T0
   - self: T7
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T8
-    other: T1
+    other: T2
     __ret_0: T0
   - self: T9
     other: T1
     __ret_0: T0
+  - self: T10
+    other: T1
+    __ret_0: T0
+  - self: T11
+    other: T1
+    __ret_0: T0
+  - self: T12
+    other: T6
+    __ret_0: T0
 
 - func: aten::neg
   namespace: edge
@@ -5825,7 +6097,7 @@
   namespace: edge
   inherits: aten::ones
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - dtype: T0
     __ret_0: T0
@@ -5834,7 +6106,7 @@
   namespace: edge
   inherits: aten::permute_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -5853,7 +6125,7 @@
   inherits: aten::pow.Tensor_Scalar
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Float, Int, Long, Short]
+    T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T2: [Bool, Float, Int]
     T3: [Bool, Int]
     T4: [Bool, Long]
@@ -5865,6 +6137,7 @@
     T10: [Int]
     T11: [Long]
     T12: [Short]
+    T13: [UInt16]
   type_constraint:
   - self: T0
     exponent: T0
@@ -5899,15 +6172,18 @@
   - self: T12
     exponent: T3
     __ret_0: T12
+  - self: T13
+    exponent: T0
+    __ret_0: T13
 
 - func: aten::pow.Tensor_Tensor
   namespace: edge
   inherits: aten::pow.Tensor_Tensor
   type_alias:
     T0: [Bool, Byte]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short]
-    T3: [Bool, Byte, Char, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
     T4: [Bool, Byte, Char, Int, Long, Short]
     T5: [Bool, Byte, Char, Int, Short]
     T6: [Bool, Byte, Char, Short]
@@ -5988,7 +6264,7 @@
   namespace: edge
   inherits: aten::reciprocal
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -6010,7 +6286,7 @@
   namespace: edge
   inherits: aten::remainder.Scalar
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Bool, Float, Int]
     T2: [Bool, Int]
     T3: [Bool, Long]
@@ -6059,9 +6335,9 @@
   inherits: aten::remainder.Tensor
   type_alias:
     T0: [Bool, Byte]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short]
-    T3: [Bool, Byte, Char, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
     T4: [Bool, Byte, Char, Int, Long, Short]
     T5: [Bool, Byte, Char, Int, Short]
     T6: [Bool, Byte, Char, Short]
@@ -6142,7 +6418,7 @@
   namespace: edge
   inherits: aten::repeat
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -6151,7 +6427,7 @@
   namespace: edge
   inherits: aten::round
   type_alias:
-    T0: [Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -6160,7 +6436,7 @@
   namespace: edge
   inherits: aten::rsqrt
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -6174,7 +6450,7 @@
   inherits: aten::rsub.Scalar
   type_alias:
     T0: [Byte]
-    T1: [Byte, Char, Float, Int, Long, Short]
+    T1: [Byte, Char, Float, Int, Long, Short, UInt16]
     T2: [Char]
     T3: [Double]
     T4: [Float]
@@ -6183,6 +6459,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T0
     other: T4
@@ -6280,6 +6557,10 @@
     other: T7
     alpha: T7
     __ret_0: T9
+  - self: T10
+    other: T4
+    alpha: T5
+    __ret_0: T4
 
 - func: aten::scalar_tensor
   namespace: edge
@@ -6295,6 +6576,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - s: T1
     dtype: T0
@@ -6323,6 +6605,9 @@
   - s: T1
     dtype: T9
     __ret_0: T9
+  - s: T1
+    dtype: T10
+    __ret_0: T10
 
 - func: aten::scatter_add
   namespace: edge
@@ -6379,7 +6664,7 @@
   namespace: edge
   inherits: aten::select_copy.int
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -6389,7 +6674,7 @@
   inherits: aten::select_scatter
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Byte]
     T3: [Char]
     T4: [Double]
@@ -6398,6 +6683,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T0
     src: T1
@@ -6426,12 +6712,15 @@
   - self: T9
     src: T1
     __ret_0: T9
+  - self: T10
+    src: T1
+    __ret_0: T10
 
 - func: aten::sigmoid
   namespace: edge
   inherits: aten::sigmoid
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -6453,7 +6742,7 @@
   namespace: edge
   inherits: aten::sin
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -6466,7 +6755,7 @@
   namespace: edge
   inherits: aten::sinh
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -6479,7 +6768,7 @@
   namespace: edge
   inherits: aten::slice_copy.Tensor
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -6489,7 +6778,7 @@
   inherits: aten::slice_scatter
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Byte]
     T3: [Char]
     T4: [Double]
@@ -6498,6 +6787,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T0
     src: T1
@@ -6526,12 +6816,15 @@
   - self: T9
     src: T1
     __ret_0: T9
+  - self: T10
+    src: T1
+    __ret_0: T10
 
 - func: aten::split_copy.Tensor
   namespace: edge
   inherits: aten::split_copy.Tensor
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -6540,7 +6833,7 @@
   namespace: edge
   inherits: aten::split_with_sizes_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -6549,7 +6842,7 @@
   namespace: edge
   inherits: aten::sqrt
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -6562,7 +6855,7 @@
   namespace: edge
   inherits: aten::squeeze_copy.dim
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -6571,7 +6864,7 @@
   namespace: edge
   inherits: aten::squeeze_copy.dims
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -6580,7 +6873,7 @@
   namespace: edge
   inherits: aten::stack
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - tensors: T0
     __ret_0: T0
@@ -6590,7 +6883,7 @@
   inherits: aten::sub.Scalar
   type_alias:
     T0: [Byte]
-    T1: [Byte, Char, Float, Int, Long, Short]
+    T1: [Byte, Char, Float, Int, Long, Short, UInt16]
     T2: [Char]
     T3: [Double]
     T4: [Float]
@@ -6599,6 +6892,7 @@
     T7: [Int]
     T8: [Long]
     T9: [Short]
+    T10: [UInt16]
   type_constraint:
   - self: T0
     other: T4
@@ -6696,15 +6990,19 @@
     other: T7
     alpha: T7
     __ret_0: T9
+  - self: T10
+    other: T4
+    alpha: T5
+    __ret_0: T4
 
 - func: aten::sub.Tensor
   namespace: edge
   inherits: aten::sub.Tensor
   type_alias:
     T0: [Byte]
-    T1: [Byte, Char, Double, Float, Half, Int, Long, Short]
-    T2: [Byte, Char, Float, Half, Int, Long, Short]
-    T3: [Byte, Char, Half, Int, Long, Short]
+    T1: [Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T2: [Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T3: [Byte, Char, Half, Int, Long, Short, UInt16]
     T4: [Byte, Char, Int, Long, Short]
     T5: [Byte, Char, Int, Short]
     T6: [Byte, Char, Short]
@@ -6718,6 +7016,7 @@
     T14: [Int]
     T15: [Long]
     T16: [Short]
+    T17: [UInt16]
   type_constraint:
   - self: T0
     other: T0
@@ -6843,6 +7142,10 @@
     other: T16
     alpha: T12
     __ret_0: T10
+  - self: T10
+    other: T17
+    alpha: T12
+    __ret_0: T10
   - self: T11
     other: T0
     alpha: T12
@@ -6883,6 +7186,10 @@
     other: T16
     alpha: T12
     __ret_0: T11
+  - self: T11
+    other: T17
+    alpha: T12
+    __ret_0: T11
   - self: T13
     other: T0
     alpha: T12
@@ -6923,6 +7230,10 @@
     other: T16
     alpha: T12
     __ret_0: T13
+  - self: T13
+    other: T17
+    alpha: T12
+    __ret_0: T13
   - self: T14
     other: T5
     alpha: T14
@@ -6971,13 +7282,25 @@
     other: T13
     alpha: T12
     __ret_0: T13
+  - self: T17
+    other: T10
+    alpha: T12
+    __ret_0: T10
+  - self: T17
+    other: T11
+    alpha: T12
+    __ret_0: T11
+  - self: T17
+    other: T13
+    alpha: T12
+    __ret_0: T13
 
 - func: aten::sum.dim_IntList
   namespace: edge
   inherits: aten::sum.dim_IntList
   type_alias:
     T0: [Bool]
-    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
     T2: [Byte]
     T3: [Char]
     T4: [Double]
@@ -7019,7 +7342,7 @@
   namespace: edge
   inherits: aten::t_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -7028,7 +7351,7 @@
   namespace: edge
   inherits: aten::tan
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -7041,7 +7364,7 @@
   namespace: edge
   inherits: aten::tanh
   type_alias:
-    T0: [Bool, Byte, Char, Float, Int, Long, Short]
+    T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16]
     T1: [Double, Half]
     T2: [Float]
   type_constraint:
@@ -7054,7 +7377,7 @@
   namespace: edge
   inherits: aten::transpose_copy.int
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -7072,7 +7395,7 @@
   namespace: edge
   inherits: aten::unbind_copy.int
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -7081,7 +7404,7 @@
   namespace: edge
   inherits: aten::unsqueeze_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -7099,7 +7422,7 @@
   namespace: edge
   inherits: aten::view_copy
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - self: T0
     __ret_0: T0
@@ -7110,9 +7433,9 @@
   type_alias:
     T0: [Bool]
     T1: [Bool, Byte]
-    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
-    T3: [Bool, Byte, Char, Float, Half, Int, Long, Short]
-    T4: [Bool, Byte, Char, Half, Int, Long, Short]
+    T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
+    T3: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16]
+    T4: [Bool, Byte, Char, Half, Int, Long, Short, UInt16]
     T5: [Bool, Byte, Char, Int, Long, Short]
     T6: [Bool, Byte, Char, Int, Short]
     T7: [Bool, Byte, Char, Short]
@@ -7127,6 +7450,7 @@
     T16: [Int]
     T17: [Long]
     T18: [Short]
+    T19: [UInt16]
   type_constraint:
   - condition: T0
     self: T1
@@ -7352,6 +7676,10 @@
     self: T13
     other: T18
     __ret_0: T13
+  - condition: T1
+    self: T13
+    other: T19
+    __ret_0: T13
   - condition: T1
     self: T14
     other: T0
@@ -7388,6 +7716,10 @@
     self: T14
     other: T18
     __ret_0: T14
+  - condition: T1
+    self: T14
+    other: T19
+    __ret_0: T14
   - condition: T1
     self: T15
     other: T0
@@ -7424,6 +7756,10 @@
     self: T15
     other: T18
     __ret_0: T15
+  - condition: T1
+    self: T15
+    other: T19
+    __ret_0: T15
   - condition: T1
     self: T16
     other: T0
@@ -7532,6 +7868,18 @@
     self: T18
     other: T18
     __ret_0: T18
+  - condition: T1
+    self: T19
+    other: T13
+    __ret_0: T13
+  - condition: T1
+    self: T19
+    other: T14
+    __ret_0: T14
+  - condition: T1
+    self: T19
+    other: T15
+    __ret_0: T15
   - condition: T9
     self: T1
     other: T9
@@ -7617,7 +7965,7 @@
   namespace: edge
   inherits: aten::zeros
   type_alias:
-    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short]
+    T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16]
   type_constraint:
   - dtype: T0
     __ret_0: T0
diff --git a/exir/dialects/edge/test/test_edge_yaml.py b/exir/dialects/edge/test/test_edge_yaml.py
index 1571cf5b01..1ff9143420 100644
--- a/exir/dialects/edge/test/test_edge_yaml.py
+++ b/exir/dialects/edge/test/test_edge_yaml.py
@@ -187,6 +187,7 @@ def test_tensor_list_supported(self) -> None:
                     "Int",
                     "Long",
                     "Short",
+                    "UInt16",
                 )
             ],
         )

From a4def9f821880781069add5f1da09b6259f6b4bf Mon Sep 17 00:00:00 2001
From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com>
Date: Thu, 16 Jan 2025 09:48:23 +0100
Subject: [PATCH 16/40] Make ArmPassManager aware of TosaSpecification (#7668)

- Pass TosaSpecification to ArmPassManager. Based on this the
 PassManager can decide which passes should be run.
- Also adds docstrings and renames some passes.

Signed-off-by: Oscar Andersson <oscar.andersson@arm.com>
---
 backends/arm/_passes/arm_pass_manager.py      | 115 ++++++++++--------
 backends/arm/_passes/cast_int64_pass.py       |   6 +-
 .../fold_qdq_with_annotated_qparams_pass.py   |  13 +-
 .../_passes/meandim_to_averagepool_pass.py    |   4 +-
 backends/arm/_passes/remove_clone_pass.py     |   3 +-
 backends/arm/arm_backend.py                   |  23 ++--
 backends/arm/quantizer/arm_quantizer.py       |  11 +-
 backends/arm/test/common.py                   |  11 +-
 backends/arm/test/ops/test_avg_pool.py        |  12 +-
 backends/arm/test/ops/test_clone.py           |  13 +-
 backends/arm/test/ops/test_expand.py          |  16 +--
 backends/arm/test/ops/test_hardtanh.py        |  18 +--
 backends/arm/test/ops/test_max_pool.py        |  16 ++-
 backends/arm/test/ops/test_permute.py         |  15 ++-
 backends/arm/test/ops/test_relu.py            |  16 +--
 backends/arm/test/ops/test_repeat.py          |  14 +--
 backends/arm/test/ops/test_var.py             |  16 +--
 .../arm/test/passes/test_fold_qdq_pass.py     |  16 +--
 .../passes/test_meandim_to_averagepool2d.py   |   8 +-
 backends/arm/test/tester/arm_tester.py        |   6 +-
 examples/arm/aot_arm_compiler.py              |  54 +++++---
 21 files changed, 225 insertions(+), 181 deletions(-)

diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index 0b4e27e5aa..14972601b6 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -7,7 +7,6 @@
 
 # pyre-unsafe
 
-import torch
 from executorch.backends.arm._passes.annotate_channels_last_dim_order_pass import (
     AnnotateChannelsLastDimOrder,
 )
@@ -47,7 +46,7 @@
 )
 from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
 from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
-    ConvertMeanDimToAveragePool,
+    ConvertMeanDimToAveragePoolPass,
 )
 from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass
 from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass
@@ -61,86 +60,98 @@
 from executorch.backends.arm._passes.unsqueeze_scalar_placeholders_pass import (
     UnsqueezeScalarPlaceholdersPass,
 )
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.exir import ExportedProgram
-from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_manager import PassManager
+from torch.fx import GraphModule
 
 
 class ArmPassManager(PassManager):
 
-    def _transform(self, graph_module: torch.fx.GraphModule):
+    def __init__(self, tosa_spec: TosaSpecification) -> None:
+        self.tosa_spec = tosa_spec
+        super().__init__()
+
+    def _transform(self, graph_module: GraphModule):
         return self(graph_module).graph_module
 
-    def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
-        """Apply passes before transforming program to backend"""
+    def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(FuseQuantizedActivationPass())
+        self.add_pass(RemoveGetItemPass())
+        self.add_pass(ConvertSplitToSlicePass())
+        self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(ConvertMeanDimToAveragePoolPass())
+
+        self.add_pass(AnnotateDecomposedMatmulPass())
+        self.add_pass(QuantizeFullArgument())
+        self.add_pass(FoldAndAnnotateQParamsPass())
+        self.add_pass(RetraceFoldedDtypesPass())
+        self.add_pass(InsertTableOpsPass(exported_program))
+
+        self.add_pass(RemoveClonePass())
+        self.add_pass(SizeAdjustConv2DPass())
+        self.add_pass(ConvertExpandCopyToRepeatPass())
+        self.add_pass(UnsqueezeBeforeRepeatPass())
+        self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
+        self.add_pass(CastInt64ToInt32Pass(exported_program))
+        self.add_pass(MatchArgRanksPass(exported_program))
+        self.add_pass(KeepDimsFalseToSqueezePass())
+        self.add_pass(Conv1dUnsqueezePass(exported_program))
+        self.add_pass(DecomposeSelectPass())
+
+        self.add_pass(AnnotateChannelsLastDimOrder())
+
+        return self._transform(exported_program.graph_module)
+
+    def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
+
+        self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
+        self.add_pass(ConvertSplitToSlicePass())
+        self.add_pass(ConvertMmToBmmPass())
+        self.add_pass(DecomposeLinearPass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
-        self.add_pass(ConvertMeanDimToAveragePool())
         self.add_pass(DecomposeMeanDimPass())
-        self.add_pass(ConvertSplitToSlicePass())
-        self.add_pass(ConvertMmToBmmPass())
-        # TODO MLETORCH-558
+        self.add_pass(ConvertMeanDimToAveragePoolPass())
+        self.add_pass(DecomposeDivPass())
+        self.add_pass(DecomposeSoftmaxesPass())
+
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeFullArgument())
-        self.add_pass(
-            FoldAndAnnotateQParamsPass(
-                [
-                    exir_ops.edge.aten.minimum.default,
-                    exir_ops.edge.aten.maximum.default,
-                    exir_ops.edge.aten.add.Tensor,
-                    exir_ops.edge.aten.avg_pool2d.default,
-                    exir_ops.edge.aten.bmm.default,
-                    exir_ops.edge.aten.cat.default,
-                    exir_ops.edge.aten.convolution.default,
-                    exir_ops.edge.aten.clone.default,
-                    exir_ops.edge.aten.exp.default,
-                    exir_ops.edge.aten.expand_copy.default,
-                    exir_ops.edge.aten.full.default,
-                    exir_ops.edge.aten.hardtanh.default,
-                    exir_ops.edge.aten.log.default,
-                    exir_ops.edge.aten.max_pool2d.default,
-                    exir_ops.edge.aten.mul.Tensor,
-                    exir_ops.edge.aten.permute_copy.default,
-                    exir_ops.edge.aten.reciprocal.default,
-                    exir_ops.edge.aten.relu.default,
-                    exir_ops.edge.aten.repeat.default,
-                    exir_ops.edge.aten.rsqrt.default,
-                    exir_ops.edge.aten.select_copy.int,
-                    exir_ops.edge.aten.sigmoid.default,
-                    exir_ops.edge.aten.slice_copy.Tensor,
-                    exir_ops.edge.aten.squeeze_copy.dims,
-                    exir_ops.edge.aten.sub.Tensor,
-                    exir_ops.edge.aten.sum.dim_IntList,
-                    exir_ops.edge.aten.tanh.default,
-                    exir_ops.edge.aten.unsqueeze_copy.default,
-                    exir_ops.edge.aten.upsample_nearest2d.vec,
-                    exir_ops.edge.aten.view_copy.default,
-                ]
-            )
-        )
+        self.add_pass(FoldAndAnnotateQParamsPass())
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(InsertTableOpsPass(exported_program))
+
+        self.add_pass(RemoveClonePass())
+        self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
-        self.add_pass(CastInt64ToInt32Pass(exported_program))
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
-        self.add_pass(SizeAdjustConv2DPass())
-        self.add_pass(RemoveClonePass())
+        self.add_pass(CastInt64ToInt32Pass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
-        self.add_pass(DecomposeDivPass())
         self.add_pass(KeepDimsFalseToSqueezePass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
-        self.add_pass(DecomposeSoftmaxesPass())
         self.add_pass(DecomposeSelectPass())
+
         self.add_pass(AnnotateChannelsLastDimOrder())
 
         return self._transform(exported_program.graph_module)
 
-    def transform_for_annotation_pipeline(self, graph_module: torch.fx.GraphModule):
+    def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
+        """Apply passes before transforming program to backend"""
+        if self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+BI"):
+            return self._tosa_080_BI_pipeline(exported_program)
+        elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+MI"):
+            return self._tosa_080_MI_pipeline(exported_program)
+        else:
+            raise NotImplementedError(
+                f"No pass pipeline implemented for {self.tosa_spec=}"
+            )
+
+    def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
diff --git a/backends/arm/_passes/cast_int64_pass.py b/backends/arm/_passes/cast_int64_pass.py
index aab6ed8eb4..dffa4c199a 100644
--- a/backends/arm/_passes/cast_int64_pass.py
+++ b/backends/arm/_passes/cast_int64_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -17,6 +17,10 @@
 
 
 class CastInt64ToInt32Pass(ExportPass):
+    """
+    Cast int64 buffers to int32 if the int64 data is in int32 range.
+    """
+
     def __init__(self, exported_program: torch.export.ExportedProgram):
         super(CastInt64ToInt32Pass, self).__init__()
         self.exported_program = exported_program
diff --git a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py
index 045506f19d..5a6b06d100 100644
--- a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py
+++ b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -6,7 +6,7 @@
 
 import copy
 
-from typing import cast, Dict, Iterable, Set, Tuple
+from typing import cast, Dict, Set, Tuple
 
 from executorch.backends.arm.tosa_quant_utils import QuantArgs
 
@@ -55,7 +55,7 @@ def get_output_qparams(node: Node) -> dict[int, QuantArgs]:
 class FoldAndAnnotateQParamsPass(ExportPass):
     """
     A pass that walks the graph and removes any DQ and Q nodes before and after the target
-     node in the supplied list of operators.
+     node.
      The quantization parameters from the DQ/Q nodes are stored as meta values to be
      accessible for later lowering and serialization passes.
      The assumption is that the quantization annotatation adds DQ nodes for all tensor
@@ -82,9 +82,8 @@ class FoldAndAnnotateQParamsPass(ExportPass):
 
     """
 
-    def __init__(self, targeted_ops: Iterable[EdgeOpOverload]) -> None:
+    def __init__(self) -> None:
         super().__init__()
-        self.targeted_ops = targeted_ops
 
     def fold_and_annotate_arg(
         self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int
@@ -131,7 +130,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
-        # Loop over the graph nodes and find any node in the 'targeted_ops' list.
+        # Loop over the graph nodes and process every call_function node.
         for n in graph_module.graph.nodes:
             n = cast(Node, n)
-            if n.op != "call_function" or n.target not in self.targeted_ops:
+            if n.op != "call_function":
                 continue
 
             # Make sure we haven't already set qparams meta information on the node
@@ -180,7 +179,7 @@ class QuantizeFullArgument(ExportPass):
 
     def call(self, graph_module: GraphModule) -> PassResult:
         modified = False
-        # Loop over the graph nodes and find any node in the 'targeted_ops' list.
+        # Loop over the graph nodes and find full.default nodes.
         for n in graph_module.graph.nodes:
             n = cast(Node, n)
             if n.target != exir_ops.edge.aten.full.default:
diff --git a/backends/arm/_passes/meandim_to_averagepool_pass.py b/backends/arm/_passes/meandim_to_averagepool_pass.py
index 0974eac740..9a75519150 100644
--- a/backends/arm/_passes/meandim_to_averagepool_pass.py
+++ b/backends/arm/_passes/meandim_to_averagepool_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -16,7 +16,7 @@
 Argument = Any
 
 
-class ConvertMeanDimToAveragePool(ExportPass):
+class ConvertMeanDimToAveragePoolPass(ExportPass):
     """
     Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation.
     """
diff --git a/backends/arm/_passes/remove_clone_pass.py b/backends/arm/_passes/remove_clone_pass.py
index ac992ce2a0..9542a4097a 100644
--- a/backends/arm/_passes/remove_clone_pass.py
+++ b/backends/arm/_passes/remove_clone_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -11,6 +11,7 @@
 
 
 class RemoveClonePass(ExportPass):
+    """Remove all clones from graph_module"""
 
     def call_operator(self, op, args, kwargs, meta):
         if op != exir_ops.edge.aten.clone.default:
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 601cac3692..7bdbdf3947 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -50,7 +50,7 @@ def __init__(self):
         self.output_format = None
         self.path_for_intermediates = None
         self.quantize_io = False
-        self.tosa_version = None
+        self.tosa_spec = None
         self.input_order = None
 
     def ethosu_compile_spec(
@@ -92,11 +92,13 @@ def ethosu_compile_spec(
         if "u55" in config:
             # Add the Ethos-U55 extension marker
             base_tosa_version += "+u55"
-        self.tosa_version = TosaSpecification.create_from_string(base_tosa_version)
+        self.tosa_spec = TosaSpecification.create_from_string(base_tosa_version)
 
         return self
 
-    def tosa_compile_spec(self, tosa_version: str) -> "ArmCompileSpecBuilder":
+    def tosa_compile_spec(
+        self, tosa_spec: str | TosaSpecification
+    ) -> "ArmCompileSpecBuilder":
         """
         Generate compile spec for TOSA flatbuffer output
         """
@@ -104,7 +106,12 @@ def tosa_compile_spec(self, tosa_version: str) -> "ArmCompileSpecBuilder":
             self.output_format is None
         ), f"Output format already set: {self.output_format}"
         self.output_format = "tosa"
-        self.tosa_version = TosaSpecification.create_from_string(tosa_version)
+        if isinstance(tosa_spec, TosaSpecification):
+            self.tosa_spec = tosa_spec
+        elif isinstance(tosa_spec, str):
+            self.tosa_spec = TosaSpecification.create_from_string(tosa_spec)
+        else:
+            raise RuntimeError(f"Invalid type for {tosa_spec}!")
         return self
 
     def dump_intermediate_artifacts_to(
@@ -138,12 +145,10 @@ def build(self) -> List[CompileSpec]:
         """
         Generate a list of compile spec objects from the builder
         """
-        assert self.tosa_version
+        assert self.tosa_spec
 
         # Always supply a TOSA version
-        self.compile_spec = [
-            CompileSpec("tosa_version", str(self.tosa_version).encode())
-        ]
+        self.compile_spec = [CompileSpec("tosa_version", str(self.tosa_spec).encode())]
 
         if self.output_format == "vela":
             self.compile_spec += [
@@ -253,7 +258,7 @@ def preprocess(  # noqa: C901
         # Converted output for this subgraph, serializer needs path early as it emits
         # const data directly. Path created and data written only in debug builds.
         tosa_graph = ts.TosaSerializer(artifact_path)
-        graph_module = ArmPassManager().transform_to_backend_pipeline(
+        graph_module = ArmPassManager(tosa_spec).transform_to_backend_pipeline(
             exported_program=edge_program
         )
 
diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py
index fe104db972..cba66cfe56 100644
--- a/backends/arm/quantizer/arm_quantizer.py
+++ b/backends/arm/quantizer/arm_quantizer.py
@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -24,6 +24,7 @@
 from executorch.backends.arm.quantizer.quantization_annotator import annotate_graph
 
 from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from torch.ao.quantization.fake_quantize import (
     FakeQuantize,
     FusedMovingAvgObsFakeQuantize,
@@ -205,8 +206,10 @@ def not_module_type_or_name_filter(n: Node) -> bool:
 
 
 class ArmQuantizer(Quantizer):
-    def __init__(self) -> None:
+
+    def __init__(self, tosa_spec: TosaSpecification) -> None:
         super().__init__()
+        self.tosa_spec = tosa_spec
         self.global_config: Optional[QuantizationConfig] = None
         self.io_config: Optional[QuantizationConfig] = None
         self.module_type_config: Dict[Callable, Optional[QuantizationConfig]] = {}
@@ -250,7 +253,9 @@ def transform_for_annotation(self, model: GraphModule) -> GraphModule:
         Currently transforms scalar values to tensor attributes.
         """
 
-        return ArmPassManager().transform_for_annotation_pipeline(graph_module=model)
+        return ArmPassManager(self.tosa_spec).transform_for_annotation_pipeline(
+            graph_module=model
+        )
 
     def annotate(self, model: GraphModule) -> GraphModule:
         """Performs the quantization annotation on the graph.
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
index bcd68cb173..c0f81bbe2e 100644
--- a/backends/arm/test/common.py
+++ b/backends/arm/test/common.py
@@ -12,6 +12,7 @@
 from pathlib import Path
 
 from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 
 
@@ -53,15 +54,17 @@ def maybe_get_tosa_collate_path() -> str | None:
     return None
 
 
-def get_tosa_compile_spec(tosa_version: str, custom_path=None) -> list[CompileSpec]:
+def get_tosa_compile_spec(
+    tosa_spec: str | TosaSpecification, custom_path=None
+) -> list[CompileSpec]:
     """
     Default compile spec for TOSA tests.
     """
-    return get_tosa_compile_spec_unbuilt(tosa_version, custom_path).build()
+    return get_tosa_compile_spec_unbuilt(tosa_spec, custom_path).build()
 
 
 def get_tosa_compile_spec_unbuilt(
-    tosa_version: str, custom_path=None
+    tosa_spec: str | TosaSpecification, custom_path=None
 ) -> ArmCompileSpecBuilder:
     """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify
     the compile spec before calling .build() to finalize it.
@@ -73,7 +76,7 @@ def get_tosa_compile_spec_unbuilt(
         os.makedirs(custom_path, exist_ok=True)
     compile_spec_builder = (
         ArmCompileSpecBuilder()
-        .tosa_compile_spec(tosa_version)
+        .tosa_compile_spec(tosa_spec)
         .dump_intermediate_artifacts_to(custom_path)
         .set_quantize_io(True)
     )
diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py
index bc37fbb136..16396950dc 100644
--- a/backends/arm/test/ops/test_avg_pool.py
+++ b/backends/arm/test/ops/test_avg_pool.py
@@ -18,6 +18,7 @@
 )
 from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.backend_details import CompileSpec
 from parameterized import parameterized
@@ -73,14 +74,14 @@ def _test_avgpool2d_tosa_MI_pipeline(
     def _test_avgpool2d_tosa_BI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+        compile_spec = common.get_tosa_compile_spec(tosa_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
             ArmTester(
                 module,
                 example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec(
-                    "TOSA-0.80+BI",
-                ),
+                compile_spec=compile_spec,
             )
             .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
             .export()
@@ -100,7 +101,8 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline(
         compile_spec: CompileSpec,
         test_data: Tuple[torch.tensor],
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         tester = (
             ArmTester(
                 module,
diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py
index 300ebb6f37..1d46173a68 100644
--- a/backends/arm/test/ops/test_clone.py
+++ b/backends/arm/test/ops/test_clone.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -19,6 +19,7 @@
 )
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa_specification import TosaSpecification
 
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 
@@ -60,13 +61,11 @@ def _test_clone_tosa_MI_pipeline(
     def _test_clone_tosa_BI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+        compile_spec = common.get_tosa_compile_spec(tosa_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
+            ArmTester(module, example_inputs=test_data, compile_spec=compile_spec)
             .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
             .export()
             .check_count({"torch.ops.aten.clone.default": 1})
diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py
index 915b1fe7e0..116f5d64e8 100644
--- a/backends/arm/test/ops/test_expand.py
+++ b/backends/arm/test/ops/test_expand.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -22,6 +22,7 @@
 )
 from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa_specification import TosaSpecification
 
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.backend_details import CompileSpec
@@ -64,13 +65,11 @@ def _test_expand_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tupl
         )
 
     def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+        compile_spec = common.get_tosa_compile_spec(tosa_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
+            ArmTester(module, example_inputs=test_data, compile_spec=compile_spec)
             .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
             .export()
             .check_count({"torch.ops.aten.expand.default": 1})
@@ -85,7 +84,8 @@ def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl
     def _test_expand_ethosu_BI_pipeline(
         self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         tester = (
             ArmTester(
                 module,
diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py
index 7125920c8c..cf0a49827a 100644
--- a/backends/arm/test/ops/test_hardtanh.py
+++ b/backends/arm/test/ops/test_hardtanh.py
@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -17,9 +17,10 @@
     ArmQuantizer,
     get_symmetric_quantization_config,
 )
-
 from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from parameterized import parameterized
 
@@ -71,13 +72,11 @@ def _test_hardtanh_tosa_MI_pipeline(
     def _test_hardtanh_tosa_BI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+        compile_spec = common.get_tosa_compile_spec(tosa_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
+            ArmTester(module, example_inputs=test_data, compile_spec=compile_spec)
             .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
             .export()
             .check_count({"torch.ops.aten.hardtanh.default": 1})
@@ -93,7 +92,8 @@ def _test_hardtanh_tosa_BI_pipeline(
     def _test_hardtanh_tosa_ethosu_BI_pipeline(
         self, compile_spec, module: torch.nn.Module, test_data: Tuple[torch.tensor]
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         tester = (
             ArmTester(
                 module,
diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py
index 81f27beab4..e3502baf2c 100644
--- a/backends/arm/test/ops/test_max_pool.py
+++ b/backends/arm/test/ops/test_max_pool.py
@@ -19,6 +19,7 @@
 )
 from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa_specification import TosaSpecification
 
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.backend_details import CompileSpec
@@ -86,15 +87,11 @@ def _test_maxpool2d_tosa_MI_pipeline(
     def _test_maxpool2d_tosa_BI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+        compile_spec = common.get_tosa_compile_spec(tosa_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec(
-                    "TOSA-0.80+BI",
-                ),
-            )
+            ArmTester(module, example_inputs=test_data, compile_spec=compile_spec)
             .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
             .export()
             .check_count({"torch.ops.aten.max_pool2d.default": 1})
@@ -118,7 +115,8 @@ def _test_maxpool2d_tosa_ethos_BI_pipeline(
         compile_spec: CompileSpec,
         test_data: Tuple[torch.tensor],
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         tester = (
             ArmTester(
                 module,
diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py
index ec7ecaa81b..f0bfe23cff 100644
--- a/backends/arm/test/ops/test_permute.py
+++ b/backends/arm/test/ops/test_permute.py
@@ -17,9 +17,9 @@
     ArmQuantizer,
     get_symmetric_quantization_config,
 )
-
 from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
@@ -74,13 +74,11 @@ def _test_permute_tosa_MI_pipeline(
     def _test_permute_tosa_BI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+        compile_spec = common.get_tosa_compile_spec(tosa_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
+            ArmTester(module, example_inputs=test_data, compile_spec=compile_spec)
             .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
             .export()
             .check_count({"torch.ops.aten.permute.default": 1})
@@ -99,7 +97,8 @@ def _test_permute_ethos_BI_pipeline(
         compile_spec: CompileSpec,
         test_data: Tuple[torch.Tensor],
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         tester = (
             ArmTester(
                 module,
diff --git a/backends/arm/test/ops/test_relu.py b/backends/arm/test/ops/test_relu.py
index 5a7bd4f5ec..dd2bc4817e 100644
--- a/backends/arm/test/ops/test_relu.py
+++ b/backends/arm/test/ops/test_relu.py
@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -16,6 +16,7 @@
 )
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.backend_details import CompileSpec
 from parameterized import parameterized
@@ -64,13 +65,11 @@ def _test_relu_tosa_MI_pipeline(
     def _test_relu_tosa_BI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.tensor]
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+        compile_spec = common.get_tosa_compile_spec(tosa_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
+            ArmTester(module, example_inputs=test_data, compile_spec=compile_spec)
             .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
             .export()
             .check_count({"torch.ops.aten.relu.default": 1})
@@ -89,7 +88,8 @@ def _test_relu_ethosu_BI_pipeline(
         module: torch.nn.Module,
         test_data: Tuple[torch.tensor],
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
             ArmTester(
                 module,
diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py
index bad872792b..d35f699b72 100644
--- a/backends/arm/test/ops/test_repeat.py
+++ b/backends/arm/test/ops/test_repeat.py
@@ -19,6 +19,7 @@
 )
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.backends.arm.tosa_specification import TosaSpecification
 
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.backend_details import CompileSpec
@@ -61,13 +62,11 @@ def _test_repeat_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tupl
         )
 
     def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+        compile_spec = common.get_tosa_compile_spec(tosa_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
+            ArmTester(module, example_inputs=test_data, compile_spec=compile_spec)
             .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
             .export()
             .check_count({"torch.ops.aten.repeat.default": 1})
@@ -82,7 +81,8 @@ def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl
     def _test_repeat_ethosu_pipeline(
         self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
             ArmTester(
                 module,
diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py
index e1fed05817..fd45c2d83f 100644
--- a/backends/arm/test/ops/test_var.py
+++ b/backends/arm/test/ops/test_var.py
@@ -15,9 +15,10 @@
     ArmQuantizer,
     get_symmetric_quantization_config,
 )
-
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.backend_details import CompileSpec
 
@@ -112,13 +113,11 @@ def _test_var_tosa_BI_pipeline(
         test_data: torch.Tensor,
         target_str: str = None,
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI")
+        compile_spec = common.get_tosa_compile_spec(tosa_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
-            ArmTester(
-                module,
-                example_inputs=test_data,
-                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
-            )
+            ArmTester(module, example_inputs=test_data, compile_spec=compile_spec)
             .quantize(Quantize(quantizer, get_symmetric_quantization_config()))
             .export()
             .to_edge()
@@ -135,7 +134,8 @@ def _test_var_ethosu_BI_pipeline(
         test_data: torch.Tensor,
         target_str: str = None,
     ):
-        quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
+        tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+        quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config())
         (
             ArmTester(
                 module,
diff --git a/backends/arm/test/passes/test_fold_qdq_pass.py b/backends/arm/test/passes/test_fold_qdq_pass.py
index cd7cf75139..ebb96faf90 100644
--- a/backends/arm/test/passes/test_fold_qdq_pass.py
+++ b/backends/arm/test/passes/test_fold_qdq_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -16,8 +16,6 @@
 
 from executorch.backends.xnnpack.test.tester.tester import RunPasses
 
-from executorch.exir.dialects._ops import ops as exir_ops
-
 
 class SimpleQuantizeModel(torch.nn.Module):
     def forward(self, x, y):
@@ -27,16 +25,6 @@ def get_inputs(self):
         return (torch.rand(1, 1280, 7, 7), torch.rand(1, 1280, 7, 7))
 
 
-class FoldAndAnnotateQParamsPassTestClass(FoldAndAnnotateQParamsPass):
-    def __init__(self):
-        super(FoldAndAnnotateQParamsPassTestClass, self).__init__(
-            [
-                exir_ops.edge.aten.add.Tensor,
-                exir_ops.edge.aten.maximum.default,
-            ]
-        )
-
-
 class TestFoldAndAnnotateQParamsPass(unittest.TestCase):
     """
     Tests the FoldAndAnnotateQParamsPass which folds dq/q nodes into
@@ -49,7 +37,7 @@ def test_fold_qdq_pass(self):
         is removed from the representation.
         """
         module = SimpleQuantizeModel()
-        test_pass_stage = RunPasses([FoldAndAnnotateQParamsPassTestClass])
+        test_pass_stage = RunPasses([FoldAndAnnotateQParamsPass])
         (
             ArmTester(
                 module,
diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py
index 93badc6435..e07e91ed72 100644
--- a/backends/arm/test/passes/test_meandim_to_averagepool2d.py
+++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -8,7 +8,7 @@
 
 import torch
 from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
-    ConvertMeanDimToAveragePool,
+    ConvertMeanDimToAveragePoolPass,
 )
 
 from executorch.backends.arm.test import common
@@ -41,7 +41,7 @@ class TestMeandimToAveragePool2dPass(unittest.TestCase):
 
     def test_tosa_BI_meandim_to_averagepool(self):
         module = MeanDim()
-        test_pass_stage = RunPasses([ConvertMeanDimToAveragePool])
+        test_pass_stage = RunPasses([ConvertMeanDimToAveragePoolPass])
         (
             ArmTester(
                 module,
@@ -58,7 +58,7 @@ def test_tosa_BI_meandim_to_averagepool(self):
 
     def test_tosa_BI_meandim_no_modification(self):
         module = MeanDim2()
-        test_pass_stage = RunPasses([ConvertMeanDimToAveragePool])
+        test_pass_stage = RunPasses([ConvertMeanDimToAveragePoolPass])
         (
             ArmTester(
                 module,
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index abb192e308..e5c700ec3c 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -33,6 +33,7 @@
     print_error_diffs,
 )
 from executorch.backends.arm.tosa_mapping import extract_tensor_meta
+from executorch.backends.arm.tosa_specification import TosaSpecification
 
 from executorch.backends.xnnpack.test.tester import Tester
 from executorch.devtools.backend_debug import get_delegation_info
@@ -184,8 +185,11 @@ def __init__(
 
     def quantize(self, quantize_stage: Optional[tester.Quantize] = None):
         if quantize_stage is None:
+            tosa_spec: TosaSpecification = TosaSpecification.create_from_compilespecs(
+                compile_specs=self.compile_spec
+            )
             quantize_stage = tester.Quantize(
-                ArmQuantizer(),
+                ArmQuantizer(tosa_spec),
                 get_symmetric_quantization_config(is_per_channel=False),
             )
         return super().quantize(quantize_stage)
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 1208d79b06..bf7bbd87ef 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -16,12 +16,13 @@
 from typing import Any, Dict, Optional, Tuple
 
 import torch
-from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder
+from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder, CompileSpec
 from executorch.backends.arm.arm_partitioner import ArmPartitioner
 from executorch.backends.arm.quantizer.arm_quantizer import (
     ArmQuantizer,
     get_symmetric_quantization_config,
 )
+from executorch.backends.arm.tosa_specification import TosaSpecification
 
 from executorch.backends.arm.util.arm_model_evaluator import (
     GenericModelEvaluator,
@@ -88,6 +89,7 @@ def get_model_and_inputs_from_name(model_name: str) -> Tuple[torch.nn.Module, An
 def quantize(
     model: torch.nn.Module,
     model_name: str,
+    tosa_spec: TosaSpecification,
     example_inputs: Tuple[torch.Tensor],
     evaluator_name: str | None,
     evaluator_config: Dict[str, Any] | None,
@@ -95,7 +97,7 @@ def quantize(
     """This is the official recommended flow for quantization in pytorch 2.0 export"""
     logging.info("Quantizing Model...")
     logging.debug(f"Original model: {model}")
-    quantizer = ArmQuantizer()
+    quantizer = ArmQuantizer(tosa_spec)
 
     # if we set is_per_channel to True, we also need to add out_variant of quantize_per_channel/dequantize_per_channel
     operator_config = get_symmetric_quantization_config(is_per_channel=False)
@@ -260,7 +262,7 @@ def get_compile_spec(
     reorder_inputs: Optional[str] = None,
     system_config: Optional[str] = None,
     memory_mode: Optional[str] = None,
-) -> ArmCompileSpecBuilder:
+) -> list[CompileSpec]:
     spec_builder = None
     if target == "TOSA":
         spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI")
@@ -513,17 +515,6 @@ def get_args():
 
     # Quantize if required
     model_int8 = None
-    if args.quantize:
-        model = quantize(
-            model, args.model_name, example_inputs, args.evaluate, args.evaluate_config
-        )
-        model_int8 = model
-        # Wrap quantized model back into an exported_program
-        exported_program = torch.export.export_for_training(model, example_inputs)
-
-    if args.intermediates:
-        os.makedirs(args.intermediates, exist_ok=True)
-
     if args.delegate:
         # As we can target multiple output encodings from ArmBackend, one must
         # be specified.
@@ -534,6 +525,23 @@ def get_args():
             args.system_config,
             args.memory_mode,
         )
+        if args.quantize:
+            tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec)
+            model = quantize(
+                model,
+                args.model_name,
+                tosa_spec,
+                example_inputs,
+                args.evaluate,
+                args.evaluate_config,
+            )
+            model_int8 = model
+            # Wrap quantized model back into an exported_program
+            exported_program = torch.export.export_for_training(model, example_inputs)
+
+            if args.intermediates:
+                os.makedirs(args.intermediates, exist_ok=True)
+
         edge = to_edge_transform_and_lower(
             exported_program,
             partitioner=[ArmPartitioner(compile_spec)],
@@ -542,7 +550,25 @@ def get_args():
                 _skip_dim_order=True,
             ),
         )
+
     else:
+        if args.quantize:
+            tosa_spec = TosaSpecification.create_from_string("TOSA-0.80.0+BI")
+            model = quantize(
+                model,
+                args.model_name,
+                tosa_spec,
+                example_inputs,
+                args.evaluate,
+                args.evaluate_config,
+            )
+            model_int8 = model
+            # Wrap quantized model back into an exported_program
+            exported_program = torch.export.export_for_training(model, example_inputs)
+
+            if args.intermediates:
+                os.makedirs(args.intermediates, exist_ok=True)
+
         edge = to_edge_transform_and_lower(
             exported_program,
             compile_config=EdgeCompileConfig(

From 0dba025e9c03d8c081d1a1086e8919e2a71b9a90 Mon Sep 17 00:00:00 2001
From: Erik Lundell <erik.lundell@arm.com>
Date: Thu, 16 Jan 2025 10:13:21 +0100
Subject: [PATCH 17/40] Remove incorrectly xfailing split tests (#7648)

Test failed due to checking for split op
when the test contained a split_with_sizes op.
Remove check.

Signed-off-by: Erik Lundell <erik.lundell@arm.com>
---
 backends/arm/test/ops/test_split.py | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/backends/arm/test/ops/test_split.py b/backends/arm/test/ops/test_split.py
index a1ba53c881..b86e27f1a4 100644
--- a/backends/arm/test/ops/test_split.py
+++ b/backends/arm/test/ops/test_split.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -101,7 +101,6 @@ def _test_split_ethosu_BI_pipeline(
             )
             .quantize()
             .export()
-            .check(["torch.ops.aten.split.Tensor"])
             .to_edge()
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
@@ -129,33 +128,14 @@ def test_split_two_out_tosa_MI(self, test_data: test_data_t):
     def test_split_tosa_BI(self, test_data: test_data_t):
         self._test_split_tosa_BI_pipeline(self.Split(), test_data)
 
-    @parameterized.expand(
-        [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]]
-    )
+    @parameterized.expand(Split.test_data)
     def test_split_u55_BI(self, test_data: test_data_t):
         self._test_split_ethosu_BI_pipeline(
             common.get_u55_compile_spec(), self.Split(), test_data
         )
 
-    # TODO MLETORCH-350
-    @parameterized.expand([Split.test_data[3], Split.test_data[5]])
-    @unittest.expectedFailure
-    def test_split_u55_BI_skip(self, test_data: test_data_t):
-        self._test_split_ethosu_BI_pipeline(
-            common.get_u55_compile_spec(), self.Split(), test_data
-        )
-
-    @parameterized.expand(
-        [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]]
-    )
+    @parameterized.expand(Split.test_data)
     def test_split_u85_BI(self, test_data: test_data_t):
         self._test_split_ethosu_BI_pipeline(
             common.get_u85_compile_spec(), self.Split(), test_data
         )
-
-    @parameterized.expand([Split.test_data[3], Split.test_data[5]])
-    @unittest.expectedFailure
-    def test_split_u85_BI_skip(self, test_data: test_data_t):
-        self._test_split_ethosu_BI_pipeline(
-            common.get_u85_compile_spec(), self.Split(), test_data
-        )

From d1b33cbbc0555935d9a0ac52f9ef7f0c1acfeccf Mon Sep 17 00:00:00 2001
From: Erik Lundell <erik.lundell@arm.com>
Date: Thu, 16 Jan 2025 14:40:54 +0100
Subject: [PATCH 18/40] Remove quantize_io from compile_spec (#7647)

quantize_io was only used in arm_partitioner
and is not needed there anymore when
running the delegate in the graph.

Signed-off-by: Erik Lundell <erik.lundell@arm.com>
---
 backends/arm/arm_backend.py                  | 21 +-------------
 backends/arm/arm_partitioner.py              |  4 ---
 backends/arm/test/common.py                  |  9 ------
 backends/arm/test/ops/test_depthwise_conv.py | 30 +++++++-------------
 examples/arm/aot_arm_compiler.py             |  2 --
 5 files changed, 11 insertions(+), 55 deletions(-)

diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index 7bdbdf3947..b4512f37af 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -49,8 +49,7 @@ def __init__(self):
         self.compiler_flags = []
         self.output_format = None
         self.path_for_intermediates = None
-        self.quantize_io = False
-        self.tosa_spec = None
+        self.tosa_version = None
         self.input_order = None
 
     def ethosu_compile_spec(
@@ -123,14 +122,6 @@ def dump_intermediate_artifacts_to(
         self.path_for_intermediates = output_path
         return self
 
-    def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder":
-        """
-        Quantization of inputs and dequantization of outputs for cases where
-        whole graph is quantized and method signature is not of quantized type.
-        """
-        self.quantize_io = quantize_io
-        return self
-
     def set_input_order(
         self, input_order: Optional[str] = None
     ) -> "ArmCompileSpecBuilder":
@@ -170,9 +161,6 @@ def build(self) -> List[CompileSpec]:
                 )
             )
 
-        if self.quantize_io:
-            self.compile_spec.append(CompileSpec("quantize_io", "True".encode()))
-
         return self.compile_spec
 
 
@@ -183,13 +171,6 @@ def is_tosa(compile_spec: List[CompileSpec]) -> bool:
     return False
 
 
-def is_quantize_io(compile_specs: List[CompileSpec]) -> bool:
-    for spec in compile_specs:
-        if spec.key == "quantize_io" and spec.value.decode() == "True":
-            return True
-    return False
-
-
 def get_tosa_version(compile_spec: List[CompileSpec]) -> TosaSpecification:
     for spec in compile_spec:
         if spec.key == "tosa_version":
diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py
index ef4589abf5..cc4058c4c5 100644
--- a/backends/arm/arm_partitioner.py
+++ b/backends/arm/arm_partitioner.py
@@ -12,7 +12,6 @@
 import torch
 from executorch.backends.arm.arm_backend import (
     ArmBackend,
-    is_quantize_io,
 )  # usort: skip
 from executorch.backends.arm.operator_support.tosa_supported_operators import (
     TOSASupportedOperators,
@@ -89,9 +88,6 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool:
                 node.meta["delegation_tag"] = tag
                 partition_tags[tag] = self.delegation_spec
 
-            if not is_quantize_io(self.delegation_spec.compile_specs):
-                continue
-
             # De-tag outmost q-nodes upwards and dq-nodes downwards.
             # De-tag if at least one input/ output is not part of partition.
             for node in partition.nodes:
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
index c0f81bbe2e..eb97d9b1e7 100644
--- a/backends/arm/test/common.py
+++ b/backends/arm/test/common.py
@@ -78,14 +78,12 @@ def get_tosa_compile_spec_unbuilt(
         ArmCompileSpecBuilder()
         .tosa_compile_spec(tosa_spec)
         .dump_intermediate_artifacts_to(custom_path)
-        .set_quantize_io(True)
     )
 
     return compile_spec_builder
 
 
 def get_u55_compile_spec(
-    quantize_io=True,
     custom_path=None,
     reorder_inputs=None,
 ) -> list[CompileSpec]:
@@ -93,14 +91,12 @@ def get_u55_compile_spec(
     Default compile spec for Ethos-U55 tests.
     """
     return get_u55_compile_spec_unbuilt(
-        quantize_io=quantize_io,
         custom_path=custom_path,
         reorder_inputs=reorder_inputs,
     ).build()
 
 
 def get_u85_compile_spec(
-    quantize_io=True,
     custom_path=None,
     reorder_inputs=None,
 ) -> list[CompileSpec]:
@@ -108,14 +104,12 @@ def get_u85_compile_spec(
     Default compile spec for Ethos-U85 tests.
     """
     return get_u85_compile_spec_unbuilt(
-        quantize_io=quantize_io,
         custom_path=custom_path,
         reorder_inputs=reorder_inputs,
     ).build()
 
 
 def get_u55_compile_spec_unbuilt(
-    quantize_io=True,
     custom_path=None,
     reorder_inputs=None,
 ) -> ArmCompileSpecBuilder:
@@ -133,7 +127,6 @@ def get_u55_compile_spec_unbuilt(
             memory_mode="Shared_Sram",
             extra_flags="--debug-force-regor --output-format=raw",
         )
-        .set_quantize_io(quantize_io)
         .dump_intermediate_artifacts_to(artifact_path)
         .set_input_order(reorder_inputs)
     )
@@ -141,7 +134,6 @@ def get_u55_compile_spec_unbuilt(
 
 
 def get_u85_compile_spec_unbuilt(
-    quantize_io=True,
     custom_path=None,
     reorder_inputs=None,
 ) -> list[CompileSpec]:
@@ -157,7 +149,6 @@ def get_u85_compile_spec_unbuilt(
             memory_mode="Shared_Sram",
             extra_flags="--output-format=raw",
         )
-        .set_quantize_io(quantize_io)
         .dump_intermediate_artifacts_to(artifact_path)
         .set_input_order(reorder_inputs)
     )
diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py
index 22d9798aea..b8d69c89f1 100644
--- a/backends/arm/test/ops/test_depthwise_conv.py
+++ b/backends/arm/test/ops/test_depthwise_conv.py
@@ -259,46 +259,38 @@ def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module):
 
     @parameterized.expand(testsuite_conv2d[:4], skip_on_empty=True)
     @pytest.mark.corstone_fvp
-    def test_dw_conv2d_u55_BI(
-        self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = True
-    ):
+    def test_dw_conv2d_u55_BI(self, test_name: str, model: torch.nn.Module):
         self._test_dw_conv_ethos_BI_pipeline(
             model,
-            common.get_u55_compile_spec(quantize_io=set_quantize_io),
+            common.get_u55_compile_spec(),
             model.get_inputs(),
         )
 
     @parameterized.expand(testsuite_conv2d[4:], skip_on_empty=True)
     @pytest.mark.corstone_fvp
     @conftest.expectedFailureOnFVP  # TODO: MLETORCH-516
-    def test_dw_conv2d_u55_BI_xfails(
-        self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False
-    ):
+    def test_dw_conv2d_u55_BI_xfails(self, test_name: str, model: torch.nn.Module):
         self._test_dw_conv_ethos_BI_pipeline(
             model,
-            common.get_u55_compile_spec(quantize_io=set_quantize_io),
+            common.get_u55_compile_spec(),
             model.get_inputs(),
         )
 
     @parameterized.expand(testsuite_conv1d, skip_on_empty=True)
     @pytest.mark.corstone_fvp
-    def test_dw_conv1d_u55_BI(
-        self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = True
-    ):
+    def test_dw_conv1d_u55_BI(self, test_name: str, model: torch.nn.Module):
         self._test_dw_conv_ethos_BI_pipeline(
             model,
-            common.get_u55_compile_spec(quantize_io=set_quantize_io),
+            common.get_u55_compile_spec(),
             model.get_inputs(),
         )
 
     @parameterized.expand(testsuite_conv1d + testsuite_conv2d_u85)
     @pytest.mark.corstone_fvp
-    def test_dw_conv_u85_BI(
-        self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = True
-    ):
+    def test_dw_conv_u85_BI(self, test_name: str, model: torch.nn.Module):
         self._test_dw_conv_ethos_BI_pipeline(
             model,
-            common.get_u85_compile_spec(quantize_io=set_quantize_io),
+            common.get_u85_compile_spec(),
             model.get_inputs(),
         )
 
@@ -306,11 +298,9 @@ def test_dw_conv_u85_BI(
     @parameterized.expand(testsuite_conv2d_u85_xfails)
     @pytest.mark.corstone_fvp
     @conftest.expectedFailureOnFVP
-    def test_dw_conv_u85_BI_xfails(
-        self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = True
-    ):
+    def test_dw_conv_u85_BI_xfails(self, test_name: str, model: torch.nn.Module):
         self._test_dw_conv_ethos_BI_pipeline(
             model,
-            common.get_u85_compile_spec(quantize_io=set_quantize_io),
+            common.get_u85_compile_spec(),
             model.get_inputs(),
         )
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index bf7bbd87ef..e842cde6bb 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -275,7 +275,6 @@ def get_compile_spec(
                 memory_mode=memory_mode,
                 extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate",
             )
-            .set_quantize_io(True)
             .set_input_order(reorder_inputs)
         )
     elif "ethos-u85" in target:
@@ -287,7 +286,6 @@ def get_compile_spec(
                 memory_mode=memory_mode,
                 extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate",
             )
-            .set_quantize_io(True)
             .set_input_order(reorder_inputs)
         )
 

From fc6b83ee5f2d1c9d38519c371e8378c1c51bffad Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mnachin@meta.com>
Date: Thu, 16 Jan 2025 10:41:36 -0500
Subject: [PATCH 19/40] Move ai-edge-model-explorer into
 devtools/install_requirements.sh (#7675)

Summary:

From top-level requirements, move to devtools/install_requirements.sh instead.

ai-edge-model-explorer is too new and currently requires numpy<2 which conflicts with our recent upgrade.

Let's not take core dependency on this tool yet.
---
 .lintrunner.toml                                   |  1 +
 devtools/install_requirements.sh                   | 11 +++++++++++
 devtools/visualization/visualization_utils.py      |  9 ++++++++-
 devtools/visualization/visualization_utils_test.py |  9 ++++++++-
 install_requirements.py                            |  1 -
 pytest.ini                                         |  1 +
 6 files changed, 29 insertions(+), 3 deletions(-)
 create mode 100755 devtools/install_requirements.sh

diff --git a/.lintrunner.toml b/.lintrunner.toml
index fe8ecad1fc..dd75ea8f32 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -294,6 +294,7 @@ include_patterns = [
     'build/**/*.py',
     'codegen/**/*.py',
     # 'devtools/**/*.py',
+    'devtools/visualization/**/*.py',
     'docs/**/*.py',
     # 'examples/**/*.py',
     # 'exir/**/*.py',
diff --git a/devtools/install_requirements.sh b/devtools/install_requirements.sh
new file mode 100755
index 0000000000..242bc70257
--- /dev/null
+++ b/devtools/install_requirements.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Conflict: this requires numpy<2 whereas ExecuTorch core requires numpy>=2. Follow
+# https://github.com/google-ai-edge/model-explorer/issues/277 for potential resolution.
+# The requirement is quoted so the shell does not parse ">=0.1.16" as a redirection.
+pip install "ai-edge-model-explorer>=0.1.16"
diff --git a/devtools/visualization/visualization_utils.py b/devtools/visualization/visualization_utils.py
index a2ee4c6050..4d520a6636 100644
--- a/devtools/visualization/visualization_utils.py
+++ b/devtools/visualization/visualization_utils.py
@@ -8,9 +8,16 @@
 import time
 
 from executorch.exir import EdgeProgramManager, ExecutorchProgramManager
-from model_explorer import config, consts, visualize_from_config  # type: ignore
 from torch.export.exported_program import ExportedProgram
 
+try:
+    from model_explorer import config, consts, visualize_from_config  # type: ignore
+except ImportError:  # model_explorer is installed separately, not via core requirements
+    print(
+        "Error: 'model_explorer' is not installed. Install using devtools/install_requirements.sh"
+    )
+    raise
+
 
 class SingletonModelExplorerServer:
     """Singleton context manager for starting a model-explorer server.
diff --git a/devtools/visualization/visualization_utils_test.py b/devtools/visualization/visualization_utils_test.py
index 89781ab4f4..dafefa7dfd 100644
--- a/devtools/visualization/visualization_utils_test.py
+++ b/devtools/visualization/visualization_utils_test.py
@@ -17,7 +17,14 @@
     visualize,
 )
 from executorch.exir import ExportedProgram
-from model_explorer.config import ModelExplorerConfig  # type: ignore
+
+try:
+    from model_explorer.config import ModelExplorerConfig  # type: ignore
+except ImportError:  # model_explorer is installed separately, not via core requirements
+    print(
+        "Error: 'model_explorer' is not installed. Install using devtools/install_requirements.sh"
+    )
+    raise
 
 
 @pytest.fixture
diff --git a/install_requirements.py b/install_requirements.py
index 26093cab84..adb26170cd 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -170,7 +170,6 @@ def python_is_compatible():
     "tomli",  # Imported by extract_sources.py when using python < 3.11.
     "wheel",  # For building the pip package archive.
     "zstd",  # Imported by resolve_buck.py.
-    "ai-edge-model-explorer>=0.1.16",  # For visualizing ExportedPrograms
 ]
 
 # Assemble the list of requirements to actually install.
diff --git a/pytest.ini b/pytest.ini
index b7e9afb9b9..d0c27fdfab 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -14,6 +14,7 @@ addopts =
     # explicitly list out tests that are running successfully in oss
     examples/models/test
     devtools/
+    --ignore=devtools/visualization/visualization_utils_test.py
     # examples
     examples/models/llama/tests
     examples/models/llama3_2_vision/preprocess

From 6d78026648dd3e167842bc53a83cb7b075d63d2f Mon Sep 17 00:00:00 2001
From: SaoirseARM <44364573+SaoirseARM@users.noreply.github.com>
Date: Thu, 16 Jan 2025 17:52:00 +0000
Subject: [PATCH 20/40] Arm backend: Remove the reordering of inputs flag
 (#7698)

Remove the reordering of inputs flag

- With updated Vela version input/output order is now preserved.
- Remove re-order of inputs in compile spec
- Remove re-order of inputs in aot_arm_compiler
---
 backends/arm/arm_backend.py             | 10 -------
 backends/arm/test/common.py             |  8 ------
 backends/arm/test/test_arm_baremetal.sh |  4 +--
 examples/arm/aot_arm_compiler.py        | 38 +++++++------------------
 examples/arm/run.sh                     |  6 +---
 5 files changed, 13 insertions(+), 53 deletions(-)

diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
index b4512f37af..979a246484 100644
--- a/backends/arm/arm_backend.py
+++ b/backends/arm/arm_backend.py
@@ -122,16 +122,6 @@ def dump_intermediate_artifacts_to(
         self.path_for_intermediates = output_path
         return self
 
-    def set_input_order(
-        self, input_order: Optional[str] = None
-    ) -> "ArmCompileSpecBuilder":
-        """
-        Reorder the inputs coming in. This may be required when inputs > 1.
-        And while using the U55/U85 CompileSpec.
-        """
-        self.input_order = input_order
-        return self
-
     def build(self) -> List[CompileSpec]:
         """
         Generate a list of compile spec objects from the builder
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py
index eb97d9b1e7..f1b9762572 100644
--- a/backends/arm/test/common.py
+++ b/backends/arm/test/common.py
@@ -85,33 +85,28 @@ def get_tosa_compile_spec_unbuilt(
 
 def get_u55_compile_spec(
     custom_path=None,
-    reorder_inputs=None,
 ) -> list[CompileSpec]:
     """
     Default compile spec for Ethos-U55 tests.
     """
     return get_u55_compile_spec_unbuilt(
         custom_path=custom_path,
-        reorder_inputs=reorder_inputs,
     ).build()
 
 
 def get_u85_compile_spec(
     custom_path=None,
-    reorder_inputs=None,
 ) -> list[CompileSpec]:
     """
     Default compile spec for Ethos-U85 tests.
     """
     return get_u85_compile_spec_unbuilt(
         custom_path=custom_path,
-        reorder_inputs=reorder_inputs,
     ).build()
 
 
 def get_u55_compile_spec_unbuilt(
     custom_path=None,
-    reorder_inputs=None,
 ) -> ArmCompileSpecBuilder:
     """Get the ArmCompileSpecBuilder for the Ethos-U55 tests, to modify
     the compile spec before calling .build() to finalize it.
@@ -128,14 +123,12 @@ def get_u55_compile_spec_unbuilt(
             extra_flags="--debug-force-regor --output-format=raw",
         )
         .dump_intermediate_artifacts_to(artifact_path)
-        .set_input_order(reorder_inputs)
     )
     return compile_spec
 
 
 def get_u85_compile_spec_unbuilt(
     custom_path=None,
-    reorder_inputs=None,
 ) -> list[CompileSpec]:
     """Get the ArmCompileSpecBuilder for the Ethos-U85 tests, to modify
     the compile spec before calling .build() to finalize it.
@@ -150,7 +143,6 @@ def get_u85_compile_spec_unbuilt(
             extra_flags="--output-format=raw",
         )
         .dump_intermediate_artifacts_to(artifact_path)
-        .set_input_order(reorder_inputs)
     )
     return compile_spec
 
diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh
index 377e1e2eb8..9f2fa4c17d 100755
--- a/backends/arm/test/test_arm_baremetal.sh
+++ b/backends/arm/test/test_arm_baremetal.sh
@@ -96,12 +96,12 @@ test_run_ethosu_fvp() { # End to End model tests
     # Ethos-U55
     echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55"
     examples/arm/run.sh --target=ethos-u55-128 --model_name=mv2
-    examples/arm/run.sh --target=ethos-u55-128 --model_name=lstm --reorder_inputs=1,0,2
+    examples/arm/run.sh --target=ethos-u55-128 --model_name=lstm
 
     # Ethos-U85
     echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85"
     examples/arm/run.sh --target=ethos-u85-128 --model_name=mv2
-    examples/arm/run.sh --target=ethos-u85-128 --model_name=lstm --reorder_inputs=1,0,2
+    examples/arm/run.sh --target=ethos-u85-128 --model_name=lstm
     }
 
 ${TEST_SUITE}
\ No newline at end of file
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index e842cde6bb..9563be93aa 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -259,7 +259,6 @@ def get_calibration_data(
 def get_compile_spec(
     target: str,
     intermediates: Optional[str] = None,
-    reorder_inputs: Optional[str] = None,
     system_config: Optional[str] = None,
     memory_mode: Optional[str] = None,
 ) -> list[CompileSpec]:
@@ -267,26 +266,18 @@ def get_compile_spec(
     if target == "TOSA":
         spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI")
     elif "ethos-u55" in target:
-        spec_builder = (
-            ArmCompileSpecBuilder()
-            .ethosu_compile_spec(
-                target,
-                system_config=system_config,
-                memory_mode=memory_mode,
-                extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate",
-            )
-            .set_input_order(reorder_inputs)
+        spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec(
+            target,
+            system_config=system_config,
+            memory_mode=memory_mode,
+            extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate",
         )
     elif "ethos-u85" in target:
-        spec_builder = (
-            ArmCompileSpecBuilder()
-            .ethosu_compile_spec(
-                target,
-                system_config=system_config,
-                memory_mode=memory_mode,
-                extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate",
-            )
-            .set_input_order(reorder_inputs)
+        spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec(
+            target,
+            system_config=system_config,
+            memory_mode=memory_mode,
+            extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate",
         )
 
     if intermediates is not None:
@@ -429,14 +420,6 @@ def get_args():
         required=False,
         help="Location for outputs, if not the default of cwd.",
     )
-    parser.add_argument(
-        "-r",
-        "--reorder_inputs",
-        type=str,
-        required=False,
-        default=None,
-        help="Provide the order of the inputs. This can be required when inputs > 1.",
-    )
     parser.add_argument(
         "--system_config",
         required=False,
@@ -519,7 +502,6 @@ def get_args():
         compile_spec = get_compile_spec(
             args.target,
             args.intermediates,
-            args.reorder_inputs,
             args.system_config,
             args.memory_mode,
         )
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 5d492cfcb1..d47e2620e6 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -29,7 +29,6 @@ build_with_etdump=false
 build_type="Release"
 extra_build_flags=""
 build_only=false
-reorder_inputs=""
 system_config=""
 memory_mode=""
 
@@ -46,7 +45,6 @@ help() {
     echo "  --extra_build_flags                    Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none "
     echo "  --build_only                           Only build, don't run FVP"
     echo "  --scratch-dir=<FOLDER>                 Path to your Ethos-U scrach dir if you not using default"
-    echo "  --reorder_inputs=<FLAGS>               Reorder the inputs. This can be required when inputs > 1."
     echo "  --system_config=<CONFIG>               System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets."
     echo "                                            NOTE: If given, this option must match the given target. This option also sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt."
     echo "  --memory_mode=<MODE>                   Memory mode to select from the Vela configuration file (see vela.ini), e.g. Shared_Sram/Sram_Only. Default: 'Shared_Sram' for Ethos-U55 targets, 'Sram_Only' for Ethos-U85 targets"
@@ -66,7 +64,6 @@ for arg in "$@"; do
       --extra_build_flags=*) extra_build_flags="${arg#*=}";;
       --build_only) build_only=true ;;
       --scratch-dir=*) root_dir="${arg#*=}";;
-      --reorder_inputs=*) reorder_inputs="${arg#*=}";;
       --system_config=*) system_config="${arg#*=}";;
       --memory_mode=*) memory_mode="${arg#*=}";;
       *)
@@ -151,7 +148,7 @@ function generate_pte_file() {
     # We are using the aot_lib from build_quantization_aot_lib below
     SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT})
 
-    local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}"
+    local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --output ${output_folder} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}"
     echo "CALL ${ARM_AOT_CMD}" >&2
     ${ARM_AOT_CMD} 1>&2
 
@@ -372,7 +369,6 @@ if [[ -z "$model_name" ]]; then
 else
     test_model=( "$model_name" )
     model_compiler_flags=( "$aot_arm_compiler_flags" )
-    reorder_inputs=( "$reorder_inputs" )
 fi
 
 # loop over running the AoT flow and executing the model on device

From af7613c7a5dd39e480aafc1146cd78f55d40bbbb Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Thu, 16 Jan 2025 09:59:32 -0800
Subject: [PATCH 21/40] [ExecuTorch][Llama] Split custom sdpa op and kv cache
 (#7412)

* [ExecuTorch][Llama] Split custom sdpa op and kv cache

Summary:
This makes it easier to swap modules with model definitions
from torchtune

Test Plan:
CI

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]

* Update on "[ExecuTorch][Llama] Split custom sdpa op and kv cache"


Summary:
This makes it easier to swap modules with model definitions
from torchtune

Test Plan:
CI

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
---
 examples/models/llama/export_llama_lib.py     |  2 +
 .../quantized_kv_cache.py                     | 79 +++++++++++++++++--
 .../llama/source_transformation/sdpa.py       | 39 +++------
 .../test_sdpa_with_quantized_kv_cache.py      | 17 +++-
 4 files changed, 101 insertions(+), 36 deletions(-)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index a562bdf13f..69980990cf 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -56,6 +56,7 @@
     get_quant_weight_transform,
 )
 from .source_transformation.quantized_kv_cache import (
+    replace_kv_cache_with_custom_kv_cache,
     replace_kv_cache_with_quantized_kv_cache,
 )
 from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm
@@ -1058,6 +1059,7 @@ def _get_source_transforms(  # noqa
         transforms.append(materialze_broadcast_of_rope_freq_cis)
 
     if args.use_sdpa_with_kv_cache:
+        transforms.append(replace_kv_cache_with_custom_kv_cache)
         transforms.append(replace_sdpa_with_custom_op)
 
     if args.quantize_kv_cache:
diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py
index a0c8c2fd93..d8ac99656f 100644
--- a/examples/models/llama/source_transformation/quantized_kv_cache.py
+++ b/examples/models/llama/source_transformation/quantized_kv_cache.py
@@ -6,6 +6,7 @@
 
 import logging
 from enum import Enum
+from typing import Tuple
 
 import torch
 import torch.nn as nn
@@ -44,7 +45,6 @@ def __init__(
             QuantizedCacheType.AffineSymmetric,
             QuantizedCacheType.AffineAsymmetric,
         ):
-
             raise ValueError(
                 f"Only affine symmetric and asymmetric cache types are supported: got {cache_type}"
             )
@@ -81,10 +81,11 @@ def __init__(
             )
 
     def _quantize(self, value):
-        scales, zero_points = (
-            torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default(
-                value, self.quantized_cache_dtype
-            )
+        (
+            scales,
+            zero_points,
+        ) = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default(
+            value, self.quantized_cache_dtype
         )
         quantized_value = torch.ops.quantized_decomposed.quantize_per_token(
             value,
@@ -262,3 +263,71 @@ def replace_kv_cache_with_quantized_kv_cache(module):
         else:
             replace_kv_cache_with_quantized_kv_cache(child)
     return module
+
+
+class CustomKVCache(nn.Module):
+    def __init__(
+        self,
+        max_batch_size: int,
+        max_seq_length: int,
+        n_heads: int,
+        head_dim: int,
+        dtype=torch.float32,
+    ):
+        super().__init__()
+        self.max_seq_length = max_seq_length
+        cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)
+
+        self.max_batch_size = max_batch_size
+        self.n_heads = n_heads
+        self.head_dim = head_dim
+        self.register_buffer(
+            "k_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
+        )
+        self.register_buffer(
+            "v_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu")
+        )
+
+    def update(
+        self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # input_pos: [S], k_val: [B, S, H, D]
+        start_pos = input_pos[0].item()
+        _ = torch.ops.llama.update_cache(k_val, self.k_cache, start_pos)
+        _ = torch.ops.llama.update_cache(v_val, self.v_cache, start_pos)
+        return self.k_cache, self.v_cache
+
+
+def replace_kv_cache_with_custom_kv_cache(module):
+    r"""
+    Replace KVCache with CustomKVCache. This modifies the model in place.
+    At the moment custom kv cache only supports cache with shape
+    [B, S, H, D] as opposed to [B, H, S, D]
+    This is because the custom op treats second dim as sequence dim.
+    Future work: support [B, H, S, D]
+    """
+    logging.warning(
+        "Replacing KVCache with CustomKVCache. This modifies the model in place."
+    )
+    for name, child in module.named_children():
+        if isinstance(child, KVCache):
+            cache_shape = child.k_cache.shape
+            cache_dtype = child.k_cache.dtype
+            assert (
+                child.is_transposed is False
+            ), "CustomKVCache does not support transposed cache"
+            max_batch_size, max_seq_length, n_heads, head_dim = cache_shape
+            setattr(
+                module,
+                name,
+                CustomKVCache(
+                    max_batch_size,
+                    max_seq_length,
+                    n_heads,
+                    head_dim,
+                    dtype=cache_dtype,
+                ),
+            )
+        else:
+            replace_kv_cache_with_custom_kv_cache(child)
+    return module
diff --git a/examples/models/llama/source_transformation/sdpa.py b/examples/models/llama/source_transformation/sdpa.py
index 59bfbe6f95..4d4b3bf7f5 100644
--- a/examples/models/llama/source_transformation/sdpa.py
+++ b/examples/models/llama/source_transformation/sdpa.py
@@ -56,33 +56,16 @@ def forward(
 
         k_cache = self.kv_cache.k_cache
         v_cache = self.kv_cache.v_cache
-        if hasattr(self.kv_cache, "quantized_cache_dtype"):
-            # updated quantize cache, scale and zero points
-            # returns dequantized kv cache
-            # Not most optimal. Optimizations to follow next
-            k_cache, v_cache = self.kv_cache.update(input_pos, k, v)
-            output = torch.ops.llama.custom_sdpa(
-                q,
-                k_cache,
-                v_cache,
-                input_pos[0].item(),
-                None,  # Attention mask
-                0,  # dropout probability. Ignored by the code
-                True,  # is_causal
-            )
-        else:
-            output = torch.ops.llama.sdpa_with_kv_cache(
-                q,
-                k,
-                v,
-                k_cache,
-                v_cache,
-                input_pos[0].item(),
-                seqlen,
-                None,  # Attention mask
-                0,  # dropout probability. Ignored by the code
-                True,  # is_causal
-            )
+        k_cache, v_cache = self.kv_cache.update(input_pos, k, v)
+        output = torch.ops.llama.custom_sdpa(
+            q,
+            k_cache,
+            v_cache,
+            input_pos[0].item(),
+            None,  # Attention mask
+            0,  # dropout probability. Ignored by the code
+            True,  # is_causal
+        )
         return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype)
 
 
@@ -106,7 +89,6 @@ def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module:
 
 
 class SDPASimple(torch.nn.Module):
-
     def __init__(
         self,
         kv_cache: KVCache,
@@ -166,7 +148,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 
 
 class SDPAFlex(torch.nn.Module):
-
     def __init__(
         self,
         kv_cache: KVCache,
diff --git a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py
index 21952d8c21..57c36dabf9 100644
--- a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py
+++ b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py
@@ -11,6 +11,7 @@
 from executorch.examples.models.llama.llama_transformer import KVCache
 
 from executorch.examples.models.llama.source_transformation.quantized_kv_cache import (
+    CustomKVCache,
     QuantizedCacheType,
     QuantizedKVCache,
 )
@@ -19,7 +20,6 @@
 
 
 class SDPAWithQuantizedKVCacheTest(unittest.TestCase):
-
     def _init_cache(self):
         self.kv_cache = KVCache(
             self.max_batch_size,
@@ -33,6 +33,19 @@ def _init_cache(self):
         self.quantized_kv_cache = QuantizedKVCache.from_float(
             self.kv_cache, QuantizedCacheType.AffineAsymmetric
         )
+        # Need this because first test actually has seq_len > 1
+        # and vanilla kvcache cannot handle seq_len > 1, due to
+        # how input_pos encoding works in the current stack.
+        # This needs fixing by making sure rest of the stack including
+        # custom ops or other backends can work with input_pos
+        # as a sequence of token positions
+        self.custom_kv_cache = CustomKVCache(
+            self.max_batch_size,
+            self.max_seq_len,
+            self.n_kv_heads,
+            self.head_dim,
+            dtype=self.dtype,
+        )
 
     def _init_kv(self):
         kv_shape = (1, self.seq_len, self.n_kv_heads, self.head_dim)
@@ -59,7 +72,7 @@ def test_simple(self, is_dynamic_shape=False):
         self.seq_len = 3
         self._init_cache()
         q, k, v = self._init_kv()
-        self.float_sdpa = SDPACustom(self.kv_cache, self.dim)
+        self.float_sdpa = SDPACustom(self.custom_kv_cache, self.dim)
         self.quantized_sdpa = SDPACustom(self.quantized_kv_cache, self.dim)
         float_out = self.float_sdpa(input_pos, q, k, v, 1, self.seq_len, None)
         quantized_out = self.quantized_sdpa(input_pos, q, k, v, 1, self.seq_len, None)

From 745f17e27590677638bd838ce8a258097acf3e21 Mon Sep 17 00:00:00 2001
From: Hardik Sharma <hardiksharma@meta.com>
Date: Thu, 16 Jan 2025 10:31:39 -0800
Subject: [PATCH 22/40] Fix Graph builder for higher order ops.

Differential Revision: D68231732

Pull Request resolved: https://github.com/pytorch/executorch/pull/7684
---
 backends/cadence/aot/graph_builder.py         | 19 ++++++++++-
 .../cadence/aot/tests/test_graph_builder.py   | 33 ++++++++++++++++++-
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/backends/cadence/aot/graph_builder.py b/backends/cadence/aot/graph_builder.py
index 88ed2ac769..fc9441891a 100644
--- a/backends/cadence/aot/graph_builder.py
+++ b/backends/cadence/aot/graph_builder.py
@@ -6,7 +6,8 @@
 from typing import Optional, Sequence, Union
 
 import torch
-from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
+from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
+from torch._dispatch.python import enable_python_dispatcher
 from torch._subclasses import FakeTensor, FakeTensorMode
 from torch.fx.node import Argument, Target
 from torch.utils import _pytree as pytree
@@ -80,6 +81,22 @@ def call_operator(
             kwargs = {}
         return super().call_operator(op, args, kwargs, meta)
 
+    def call_submodule(
+        self, graph_module: torch.fx.GraphModule, inputs: tuple[Argument, ...]
+    ) -> PassResult:
+        return ExportPass().call(graph_module)
+
+    def _fx(
+        self,
+        kind: str,
+        target: torch.fx.node.Target,
+        args: tuple[Argument, ...],
+        kwargs: dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        with self.fake_tensor_mode, enable_python_dispatcher():
+            return super()._fx(kind, target, args, kwargs, meta)
+
 
 def single_op_builder(
     placeholders: Sequence[Union[torch.Tensor, FakeTensor]],
diff --git a/backends/cadence/aot/tests/test_graph_builder.py b/backends/cadence/aot/tests/test_graph_builder.py
index 04097c1725..ebef97be52 100644
--- a/backends/cadence/aot/tests/test_graph_builder.py
+++ b/backends/cadence/aot/tests/test_graph_builder.py
@@ -1,5 +1,9 @@
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 
+# pyre-strict
+
+
+from typing import Sequence
 
 import executorch.backends.cadence.aot.ops_registrations  # noqa
 import torch
@@ -9,7 +13,7 @@
 )
 from executorch.backends.cadence.aot.pass_utils import count_node
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.pass_base import ExportPass
+from executorch.exir.pass_base import ExportPass, NodeMetadata
 from later.unittest import TestCase
 
 
@@ -68,3 +72,30 @@ def test_graph_with_single_im2row(self) -> None:
         # Check graph has a single im2row node.
         self.assertEqual(len([gm.graph.nodes]), 1)
         self.assertEqual(count_node(gm, exir_ops.edge.cadence.im2row.default), 1)
+
+
+class TestHigherOrderOps(TestCase):
+    def _get_inner_graph(self, x_shape: Sequence[int]) -> torch.fx.GraphModule:
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(*x_shape))
+        add = builder.call_operator(
+            exir_ops.edge.aten.add.Tensor,
+            (x, x),  # pyre-ignore
+        )
+        builder.output([x, add])
+        gm = builder.get_graph_module()
+        # Check if graph module is valid by running exportpass on it.
+        gm = ExportPass().call(gm).graph_module
+        return gm
+
+    def test_call_map(self) -> None:
+        builder = GraphBuilder()
+        x_shape = (4, 8, 8)
+        x = builder.placeholder("x", torch.randn(*x_shape))
+        map_node = builder.call_map(
+            self._get_inner_graph(x_shape[1:]), [x], [], NodeMetadata({})
+        )
+        builder.output([map_node])
+        gm = builder.get_graph_module()
+        # Check if graph module is valid by running exportpass on it.
+        ExportPass().call(gm).graph_module

From b8180626c129f9f11b33e4424aca7dfee6fca9df Mon Sep 17 00:00:00 2001
From: cccclai <chenlai@meta.com>
Date: Thu, 16 Jan 2025 13:03:04 -0800
Subject: [PATCH 23/40] add qnn test template job (#7636)

* add qnn test template job (#7636)

Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/7636

Differential Revision: D68112936

* Increase timeout

---------

Co-authored-by: Huy Do <huydhn@gmail.com>
---
 .github/workflows/pull.yml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index d1b64e7598..8b32e46cf2 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -395,6 +395,25 @@ jobs:
         # Test llama2
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
 
+  test-qnn-models-linux:
+    name: test-qnn-models-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 180
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Placeholder for running test_qnn_delegate.py. A matrix can be used to trigger different jobs; see test-llama-runner-qnn-linux for reference.
+        # reminder: make sure each job runs fast
+
   test-phi-3-mini-runner-linux:
     name: test-phi-3-mini-runner-linux
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main

From 9c043290ad3944268290e015c3063bc411e6ef6b Mon Sep 17 00:00:00 2001
From: Gasoonjia <gasoonjia@meta.com>
Date: Thu, 16 Jan 2025 13:07:16 -0800
Subject: [PATCH 24/40] support half and bf16 in to_dim_order_copy (#7689)

Differential Revision: D68245619

Pull Request resolved: https://github.com/pytorch/executorch/pull/7693
---
 .../portable/cpu/op__to_dim_order_copy.cpp    |  8 ++++++--
 kernels/test/op__to_dim_order_copy_test.cpp   | 19 +++++++++++++++----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/kernels/portable/cpu/op__to_dim_order_copy.cpp b/kernels/portable/cpu/op__to_dim_order_copy.cpp
index 31dd4fbb9d..bcbf6cc132 100644
--- a/kernels/portable/cpu/op__to_dim_order_copy.cpp
+++ b/kernels/portable/cpu/op__to_dim_order_copy.cpp
@@ -96,13 +96,17 @@ Tensor& _to_dim_order_copy_out(
       InvalidArgument,
       out);
 
-  ET_SWITCH_REALHB_TYPES(
+  if (self.numel() == 0) {
+    return out;
+  }
+
+  ET_SWITCH_REALHBBF16_TYPES(
       self.scalar_type(),
       ctx,
       "dim_order_ops::_to_dim_order_copy.out",
       CTYPE_IN,
       [&] {
-        ET_SWITCH_REALHB_TYPES(
+        ET_SWITCH_REALHBBF16_TYPES(
             out.scalar_type(),
             ctx,
             "dim_order_ops::_to_dim_order_copy.out",
diff --git a/kernels/test/op__to_dim_order_copy_test.cpp b/kernels/test/op__to_dim_order_copy_test.cpp
index e888e0fc7f..073225a7d6 100644
--- a/kernels/test/op__to_dim_order_copy_test.cpp
+++ b/kernels/test/op__to_dim_order_copy_test.cpp
@@ -36,7 +36,9 @@ typedef std::map<
           std::type_index,
           std::variant<
             std::vector<float>,
-            std::vector<double>>>
+            std::vector<double>,
+            std::vector<exec_aten::Half>,
+            std::vector<exec_aten::BFloat16>>>
         FloatingTypeToDataMap;
 
 typedef std::map<
@@ -381,9 +383,9 @@ TEST_F(OpToDimOrderCopyTest, NanInfSupported) {
       ScalarType::OUTPUT_DTYPE>(test_cases);
 
 #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \
-  ET_FORALL_FLOAT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL);
+  ET_FORALL_FLOATHBF16_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL);
 
-  ET_FORALL_FLOAT_TYPES(TEST_ENTRY);
+  ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY);
 
 #undef TEST_ENTRY
 #undef TEST_KERNEL
@@ -413,6 +415,13 @@ TEST_F(OpToDimOrderCopyTest, HardcodeFloatConvertInt) {
       -0.30919688936285893988};
   // clang-format on
 
+  std::vector<exec_aten::Half> half_data;
+  std::vector<exec_aten::BFloat16> bf16_data;
+  for (auto d : double_data) {
+    half_data.emplace_back(d);
+    bf16_data.emplace_back(d);
+  }
+
   std::vector<int64_t> int64_data = {
       -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0};
   std::vector<int32_t> int32_data = {
@@ -426,6 +435,8 @@ TEST_F(OpToDimOrderCopyTest, HardcodeFloatConvertInt) {
   FloatingTypeToDataMap floating_point_data;
   floating_point_data[typeid(float)] = float_data;
   floating_point_data[typeid(double)] = double_data;
+  floating_point_data[typeid(exec_aten::Half)] = half_data;
+  floating_point_data[typeid(exec_aten::BFloat16)] = bf16_data;
 
   // Gathering all int data together for better traversial
   IntTypeToDataMap int_data;
@@ -444,7 +455,7 @@ TEST_F(OpToDimOrderCopyTest, HardcodeFloatConvertInt) {
 #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \
   ET_FORALL_INT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL);
 
-  ET_FORALL_FLOAT_TYPES(TEST_ENTRY);
+  ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY);
 }
 
 TEST_F(OpToDimOrderCopyTest, MismatchedSizesDie) {

From 1b7b10efd1d61f4e96a129ea416dcc6bcc2c3f2c Mon Sep 17 00:00:00 2001
From: Nicholas Long <19273992+cptspacemanspiff@users.noreply.github.com>
Date: Thu, 16 Jan 2025 17:07:36 -0500
Subject: [PATCH 25/40] Fixed always rebuild issue in cmake. (#7512)

* Fixed always rebuild issue in cmake.

The generated files were located in include/executorch/schema/program_generated.h,
but CMake was expecting them in include/executorch/program_generated.h.

Presumably the output location changed at some point, and the expected output path in CMake was never updated.
---
 schema/CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/schema/CMakeLists.txt b/schema/CMakeLists.txt
index 5a4013f43e..64f8821da1 100644
--- a/schema/CMakeLists.txt
+++ b/schema/CMakeLists.txt
@@ -15,7 +15,7 @@ endif()
 
 # The include directory that will contain the generated schema headers.
 set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/schema/include")
-
+set(_program_schema__output_dir "${_program_schema__include_dir}/executorch/schema")
 # Source root directory for executorch.
 if(NOT EXECUTORCH_ROOT)
   set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
@@ -26,7 +26,7 @@ function(generate_program_schema _schema_srcs _schema_name)
   foreach(fbs_file ${_schema_srcs})
     string(REGEX REPLACE "[.]fbs$" "_generated.h" generated "${fbs_file}")
     list(APPEND _schema_outputs
-         "${_program_schema__include_dir}/executorch/${generated}"
+         "${_program_schema__output_dir}/${generated}"
     )
   endforeach()
 
@@ -35,7 +35,7 @@ function(generate_program_schema _schema_srcs _schema_name)
     OUTPUT ${_schema_outputs}
     COMMAND
       ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o
-      "${_program_schema__include_dir}/executorch/schema" ${_schema_srcs}
+      "${_program_schema__output_dir}" ${_schema_srcs}
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
     DEPENDS ${FLATC_EXECUTABLE} ${_schema_srcs}
     COMMENT "Generating ${_schema_name} headers"

From 2f0518d2cfb4ee4353dce4e39590de43fa391399 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 16 Jan 2025 18:03:54 -0600
Subject: [PATCH 26/40] [xnnpack] Reexport after quantize in aot_compiler
 (#7714)

---
 examples/xnnpack/aot_compiler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py
index c3538db4d8..e1542245ac 100644
--- a/examples/xnnpack/aot_compiler.py
+++ b/examples/xnnpack/aot_compiler.py
@@ -92,6 +92,7 @@
         logging.info("Quantizing Model...")
         # TODO(T165162973): This pass shall eventually be folded into quantizer
         model = quantize(model, example_inputs)
+        ep = torch.export.export_for_training(model, example_inputs)
 
     edge = to_edge_transform_and_lower(
         ep,

From 007ea3ed0cec116f669cca389a5f23679f7da74c Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 16 Jan 2025 16:37:29 -0800
Subject: [PATCH 27/40] install_requirements.py: use argparse, minor cleanup
 (#7703)

Replace the bespoke argument parser with argparse in preparation for separating requirements installation from building and installing ExecuTorch itself.
---
 install_requirements.py | 347 ++++++++++++++++++++++------------------
 1 file changed, 188 insertions(+), 159 deletions(-)

diff --git a/install_requirements.py b/install_requirements.py
index adb26170cd..c16cacca46 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -6,7 +6,9 @@
 # LICENSE file in the root directory of this source tree.
 
 
+import argparse
 import glob
+import itertools
 import os
 import platform
 import re
@@ -63,174 +65,201 @@ def python_is_compatible():
     return True
 
 
-if not python_is_compatible():
-    sys.exit(1)
+def clean():
+    print("Cleaning build artifacts...")
+    print("Cleaning pip-out/...")
+    shutil.rmtree("pip-out/", ignore_errors=True)
+    dirs = glob.glob("cmake-out*/") + glob.glob("cmake-android-out/")
+    for d in dirs:
+        print(f"Cleaning {d}...")
+        shutil.rmtree(d, ignore_errors=True)
+    print("Done cleaning build artifacts.")
 
-# Parse options.
 
-EXECUTORCH_BUILD_PYBIND = ""
-CMAKE_ARGS = os.getenv("CMAKE_ARGS", "")
-CMAKE_BUILD_ARGS = os.getenv("CMAKE_BUILD_ARGS", "")
-USE_PYTORCH_NIGHTLY = True
+VALID_PYBINDS = ["coreml", "mps", "xnnpack"]
 
-args = sys.argv[1:]
-for arg in args:
-    if arg == "--pybind":
-        pass
-    elif arg in ["coreml", "mps", "xnnpack"]:
-        if "--pybind" in args:
-            arg_upper = arg.upper()
-            EXECUTORCH_BUILD_PYBIND = "ON"
-            CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{arg_upper}=ON"
-        else:
-            print(f"Error: {arg} must follow --pybind")
-            sys.exit(1)
-    elif arg == "off":
-        if "--pybind" in args:
-            if EXECUTORCH_BUILD_PYBIND == "ON":
-                print("Cannot turnoff pybind option as it is already set.")
-                sys.exit(1)
+
+def main(args):
+    if not python_is_compatible():
+        sys.exit(1)
+
+    # Parse options.
+
+    EXECUTORCH_BUILD_PYBIND = ""
+    CMAKE_ARGS = os.getenv("CMAKE_ARGS", "")
+    CMAKE_BUILD_ARGS = os.getenv("CMAKE_BUILD_ARGS", "")
+    USE_PYTORCH_NIGHTLY = True
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pybind",
+        action="append",
+        nargs="+",
+        help="one or more of coreml/mps/xnnpack, or off",
+    )
+    parser.add_argument(
+        "--clean",
+        action="store_true",
+        help="clean build artifacts and pip-out instead of installing",
+    )
+    parser.add_argument(
+        "--use-pt-pinned-commit",
+        action="store_true",
+        help="build from the pinned PyTorch commit instead of nightly",
+    )
+    args = parser.parse_args(args)
+
+    if args.clean:
+        clean()
+        return
+
+    if args.pybind:
+        # Flatten list of lists.
+        args.pybind = list(itertools.chain(*args.pybind))
+        if "off" in args.pybind:
+            if len(args.pybind) != 1:
+                raise Exception(
+                    f"Cannot combine `off` with other pybinds: {args.pybind}"
+                )
             EXECUTORCH_BUILD_PYBIND = "OFF"
         else:
-            print(f"Error: {arg} must follow --pybind")
-            sys.exit(1)
-
-    elif arg == "--clean":
-        print("Cleaning build artifacts...")
-        print("Cleaning pip-out/...")
-        shutil.rmtree("pip-out/", ignore_errors=True)
-        dirs = glob.glob("cmake-out*/") + glob.glob("cmake-android-out/")
-        for d in dirs:
-            print(f"Cleaning {d}...")
-            shutil.rmtree(d, ignore_errors=True)
-        print("Done cleaning build artifacts.")
-        sys.exit(0)
-    elif arg == "--use-pt-pinned-commit":
+            for pybind_arg in args.pybind:
+                if pybind_arg not in VALID_PYBINDS:
+                    raise Exception(
+                        f"Unrecognized pybind argument {pybind_arg}; valid options are: {', '.join(VALID_PYBINDS)}"
+                    )
+                EXECUTORCH_BUILD_PYBIND = "ON"
+                CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON"
+
+    if args.use_pt_pinned_commit:
         # This option is used in CI to make sure that PyTorch build from the pinned commit
         # is used instead of nightly. CI jobs wouldn't be able to catch regression from the
         # latest PT commit otherwise
         USE_PYTORCH_NIGHTLY = False
-    else:
-        print(f"Error: Unknown option {arg}")
-        sys.exit(1)
 
-# If --pybind is not set explicitly for backends (e.g., --pybind xnnpack)
-# or is not turned off explicitly (--pybind off)
-# then install XNNPACK by default.
-if EXECUTORCH_BUILD_PYBIND == "":
-    EXECUTORCH_BUILD_PYBIND = "ON"
-    CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON"
-
-# Use ClangCL on Windows.
-# ClangCL is an alias to Clang that configures it to work in an MSVC-compatible
-# mode. Using it on Windows to avoid compiler compatibility issues for MSVC.
-if os.name == "nt":
-    CMAKE_ARGS += " -T ClangCL"
-
-# Since ExecuTorch often uses main-branch features of pytorch, only the nightly
-# pip versions will have the required features.
-#
-# NOTE: If a newly-fetched version of the executorch repo changes the value of
-# NIGHTLY_VERSION, you should re-run this script to install the necessary
-# package versions.
-NIGHTLY_VERSION = "dev20250104"
-
-# The pip repository that hosts nightly torch packages.
-TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu"
-
-# pip packages needed by exir.
-EXIR_REQUIREMENTS = [
-    # Setting USE_PYTORCH_NIGHTLY to false to test the pinned PyTorch commit. Note
-    # that we don't need to set any version number there because they have already
-    # been installed on CI before this step, so pip won't reinstall them
-    f"torch==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torch",
-    (
-        f"torchvision==0.22.0.{NIGHTLY_VERSION}"
-        if USE_PYTORCH_NIGHTLY
-        else "torchvision"
-    ),  # For testing.
-    "typing-extensions",
-]
-
-# pip packages needed to run examples.
-# TODO: Make each example publish its own requirements.txt
-EXAMPLES_REQUIREMENTS = [
-    "timm==1.0.7",
-    f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torchaudio",
-    "torchsr==1.0.4",
-    "transformers==4.47.1",
-]
-
-# pip packages needed for development.
-DEVEL_REQUIREMENTS = [
-    "cmake",  # For building binary targets.
-    "pip>=23",  # For building the pip package.
-    "pyyaml",  # Imported by the kernel codegen tools.
-    "setuptools>=63",  # For building the pip package.
-    "tomli",  # Imported by extract_sources.py when using python < 3.11.
-    "wheel",  # For building the pip package archive.
-    "zstd",  # Imported by resolve_buck.py.
-]
-
-# Assemble the list of requirements to actually install.
-# TODO: Add options for reducing the number of requirements.
-REQUIREMENTS_TO_INSTALL = EXIR_REQUIREMENTS + DEVEL_REQUIREMENTS + EXAMPLES_REQUIREMENTS
-
-# Install the requirements. `--extra-index-url` tells pip to look for package
-# versions on the provided URL if they aren't available on the default URL.
-subprocess.run(
-    [
-        sys.executable,
-        "-m",
-        "pip",
-        "install",
-        *REQUIREMENTS_TO_INSTALL,
-        "--extra-index-url",
-        TORCH_NIGHTLY_URL,
-    ],
-    check=True,
-)
-
-LOCAL_REQUIREMENTS = [
-    "third-party/ao",  # We need the latest kernels for fast iteration, so not relying on pypi.
-]
-
-# Install packages directly from local copy instead of pypi.
-# This is usually not recommended.
-subprocess.run(
-    [
-        sys.executable,
-        "-m",
-        "pip",
-        "install",
-        *LOCAL_REQUIREMENTS,
-    ],
-    check=True,
-)
+    # If --pybind is not set explicitly for backends (e.g., --pybind xnnpack)
+    # or is not turned off explicitly (--pybind off)
+    # then install XNNPACK by default.
+    if EXECUTORCH_BUILD_PYBIND == "":
+        EXECUTORCH_BUILD_PYBIND = "ON"
+        CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON"
+
+    # Use ClangCL on Windows.
+    # ClangCL is an alias to Clang that configures it to work in an MSVC-compatible
+    # mode. Using it on Windows to avoid compiler compatibility issues for MSVC.
+    if os.name == "nt":
+        CMAKE_ARGS += " -T ClangCL"
+
+    # Since ExecuTorch often uses main-branch features of pytorch, only the nightly
+    # pip versions will have the required features.
+    #
+    # NOTE: If a newly-fetched version of the executorch repo changes the value of
+    # NIGHTLY_VERSION, you should re-run this script to install the necessary
+    # package versions.
+    NIGHTLY_VERSION = "dev20250104"
+
+    # The pip repository that hosts nightly torch packages.
+    TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu"
+
+    # pip packages needed by exir.
+    EXIR_REQUIREMENTS = [
+        # Setting USE_PYTORCH_NIGHTLY to false to test the pinned PyTorch commit. Note
+        # that we don't need to set any version number there because they have already
+        # been installed on CI before this step, so pip won't reinstall them
+        f"torch==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torch",
+        (
+            f"torchvision==0.22.0.{NIGHTLY_VERSION}"
+            if USE_PYTORCH_NIGHTLY
+            else "torchvision"
+        ),  # For testing.
+        "typing-extensions",
+    ]
+
+    # pip packages needed to run examples.
+    # TODO: Make each example publish its own requirements.txt
+    EXAMPLES_REQUIREMENTS = [
+        "timm==1.0.7",
+        f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torchaudio",
+        "torchsr==1.0.4",
+        "transformers==4.47.1",
+    ]
+
+    # pip packages needed for development.
+    DEVEL_REQUIREMENTS = [
+        "cmake",  # For building binary targets.
+        "pip>=23",  # For building the pip package.
+        "pyyaml",  # Imported by the kernel codegen tools.
+        "setuptools>=63",  # For building the pip package.
+        "tomli",  # Imported by extract_sources.py when using python < 3.11.
+        "wheel",  # For building the pip package archive.
+        "zstd",  # Imported by resolve_buck.py.
+    ]
+
+    # Assemble the list of requirements to actually install.
+    # TODO: Add options for reducing the number of requirements.
+    REQUIREMENTS_TO_INSTALL = (
+        EXIR_REQUIREMENTS + DEVEL_REQUIREMENTS + EXAMPLES_REQUIREMENTS
+    )
+
+    # Install the requirements. `--extra-index-url` tells pip to look for package
+    # versions on the provided URL if they aren't available on the default URL.
+    subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "pip",
+            "install",
+            *REQUIREMENTS_TO_INSTALL,
+            "--extra-index-url",
+            TORCH_NIGHTLY_URL,
+        ],
+        check=True,
+    )
+
+    LOCAL_REQUIREMENTS = [
+        "third-party/ao",  # We need the latest kernels for fast iteration, so not relying on pypi.
+    ]
+
+    # Install packages directly from local copy instead of pypi.
+    # This is usually not recommended.
+    subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "pip",
+            "install",
+            *LOCAL_REQUIREMENTS,
+        ],
+        check=True,
+    )
+
+    #
+    # Install executorch pip package. This also makes `flatc` available on the path.
+    # The --extra-index-url may be necessary if pyproject.toml has a dependency on a
+    # pre-release or nightly version of a torch package.
+    #
+
+    # Set environment variables
+    os.environ["EXECUTORCH_BUILD_PYBIND"] = EXECUTORCH_BUILD_PYBIND
+    os.environ["CMAKE_ARGS"] = CMAKE_ARGS
+    os.environ["CMAKE_BUILD_ARGS"] = CMAKE_BUILD_ARGS
+
+    # Run the pip install command
+    subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "pip",
+            "install",
+            ".",
+            "--no-build-isolation",
+            "-v",
+            "--extra-index-url",
+            TORCH_NIGHTLY_URL,
+        ],
+        check=True,
+    )
 
-#
-# Install executorch pip package. This also makes `flatc` available on the path.
-# The --extra-index-url may be necessary if pyproject.toml has a dependency on a
-# pre-release or nightly version of a torch package.
-#
 
-# Set environment variables
-os.environ["EXECUTORCH_BUILD_PYBIND"] = EXECUTORCH_BUILD_PYBIND
-os.environ["CMAKE_ARGS"] = CMAKE_ARGS
-os.environ["CMAKE_BUILD_ARGS"] = CMAKE_BUILD_ARGS
-
-# Run the pip install command
-subprocess.run(
-    [
-        sys.executable,
-        "-m",
-        "pip",
-        "install",
-        ".",
-        "--no-build-isolation",
-        "-v",
-        "--extra-index-url",
-        TORCH_NIGHTLY_URL,
-    ],
-    check=True,
-)
+if __name__ == "__main__":
+    main(sys.argv[1:])

From 9f47380ab5d4818270a7ea2eac13d9a4caa76dd0 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Thu, 16 Jan 2025 16:41:28 -0800
Subject: [PATCH 28/40] install_requirements.py: refactor: extract
 install_requirements() function (#7704)

More preparation for separating installation of requirements from installation of ExecuTorch.

Test Plan: ./install_requirements.sh in a fresh venv succeeded and reported installing executorch
---
 install_requirements.py | 169 +++++++++++++++++++++-------------------
 1 file changed, 87 insertions(+), 82 deletions(-)

diff --git a/install_requirements.py b/install_requirements.py
index c16cacca46..409460ca10 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -79,97 +79,29 @@ def clean():
 VALID_PYBINDS = ["coreml", "mps", "xnnpack"]
 
 
-def main(args):
-    if not python_is_compatible():
-        sys.exit(1)
-
-    # Parse options.
-
-    EXECUTORCH_BUILD_PYBIND = ""
-    CMAKE_ARGS = os.getenv("CMAKE_ARGS", "")
-    CMAKE_BUILD_ARGS = os.getenv("CMAKE_BUILD_ARGS", "")
-    USE_PYTORCH_NIGHTLY = True
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--pybind",
-        action="append",
-        nargs="+",
-        help="one or more of coreml/mps/xnnpack, or off",
-    )
-    parser.add_argument(
-        "--clean",
-        action="store_true",
-        help="clean build artifacts and pip-out instead of installing",
-    )
-    parser.add_argument(
-        "--use-pt-pinned-commit",
-        action="store_true",
-        help="build from the pinned PyTorch commit instead of nightly",
-    )
-    args = parser.parse_args(args)
-
-    if args.clean:
-        clean()
-        return
-
-    if args.pybind:
-        # Flatten list of lists.
-        args.pybind = list(itertools.chain(*args.pybind))
-        if "off" in args.pybind:
-            if len(args.pybind) != 1:
-                raise Exception(
-                    f"Cannot combine `off` with other pybinds: {args.pybind}"
-                )
-            EXECUTORCH_BUILD_PYBIND = "OFF"
-        else:
-            for pybind_arg in args.pybind:
-                if pybind_arg not in VALID_PYBINDS:
-                    raise Exception(
-                        f"Unrecognized pybind argument {pybind_arg}; valid options are: {', '.join(VALID_PYBINDS)}"
-                    )
-                EXECUTORCH_BUILD_PYBIND = "ON"
-                CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON"
+# The pip repository that hosts nightly torch packages.
+TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu"
 
-    if args.use_pt_pinned_commit:
-        # This option is used in CI to make sure that PyTorch build from the pinned commit
-        # is used instead of nightly. CI jobs wouldn't be able to catch regression from the
-        # latest PT commit otherwise
-        USE_PYTORCH_NIGHTLY = False
 
-    # If --pybind is not set explicitly for backends (e.g., --pybind xnnpack)
-    # or is not turned off explicitly (--pybind off)
-    # then install XNNPACK by default.
-    if EXECUTORCH_BUILD_PYBIND == "":
-        EXECUTORCH_BUILD_PYBIND = "ON"
-        CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON"
-
-    # Use ClangCL on Windows.
-    # ClangCL is an alias to Clang that configures it to work in an MSVC-compatible
-    # mode. Using it on Windows to avoid compiler compatibility issues for MSVC.
-    if os.name == "nt":
-        CMAKE_ARGS += " -T ClangCL"
-
-    # Since ExecuTorch often uses main-branch features of pytorch, only the nightly
-    # pip versions will have the required features.
-    #
-    # NOTE: If a newly-fetched version of the executorch repo changes the value of
-    # NIGHTLY_VERSION, you should re-run this script to install the necessary
-    # package versions.
-    NIGHTLY_VERSION = "dev20250104"
+# Since ExecuTorch often uses main-branch features of pytorch, only the nightly
+# pip versions will have the required features.
+#
+# NOTE: If a newly-fetched version of the executorch repo changes the value of
+# NIGHTLY_VERSION, you should re-run this script to install the necessary
+# package versions.
+NIGHTLY_VERSION = "dev20250104"
 
-    # The pip repository that hosts nightly torch packages.
-    TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu"
 
+def install_requirements(use_pytorch_nightly):
     # pip packages needed by exir.
     EXIR_REQUIREMENTS = [
-        # Setting USE_PYTORCH_NIGHTLY to false to test the pinned PyTorch commit. Note
+        # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
         # that we don't need to set any version number there because they have already
         # been installed on CI before this step, so pip won't reinstall them
-        f"torch==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torch",
+        f"torch==2.6.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
         (
             f"torchvision==0.22.0.{NIGHTLY_VERSION}"
-            if USE_PYTORCH_NIGHTLY
+            if use_pytorch_nightly
             else "torchvision"
         ),  # For testing.
         "typing-extensions",
@@ -179,7 +111,7 @@ def main(args):
     # TODO: Make each example publish its own requirements.txt
     EXAMPLES_REQUIREMENTS = [
         "timm==1.0.7",
-        f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torchaudio",
+        f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchaudio",
         "torchsr==1.0.4",
         "transformers==4.47.1",
     ]
@@ -233,6 +165,79 @@ def main(args):
         check=True,
     )
 
+
+def main(args):
+    if not python_is_compatible():
+        sys.exit(1)
+
+    # Parse options.
+
+    EXECUTORCH_BUILD_PYBIND = ""
+    CMAKE_ARGS = os.getenv("CMAKE_ARGS", "")
+    CMAKE_BUILD_ARGS = os.getenv("CMAKE_BUILD_ARGS", "")
+    use_pytorch_nightly = True
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pybind",
+        action="append",
+        nargs="+",
+        help="one or more of coreml/mps/xnnpack, or off",
+    )
+    parser.add_argument(
+        "--clean",
+        action="store_true",
+        help="clean build artifacts and pip-out instead of installing",
+    )
+    parser.add_argument(
+        "--use-pt-pinned-commit",
+        action="store_true",
+        help="build from the pinned PyTorch commit instead of nightly",
+    )
+    args = parser.parse_args(args)
+    if args.pybind:
+        # Flatten list of lists.
+        args.pybind = list(itertools.chain(*args.pybind))
+        if "off" in args.pybind:
+            if len(args.pybind) != 1:
+                raise Exception(
+                    f"Cannot combine `off` with other pybinds: {args.pybind}"
+                )
+            EXECUTORCH_BUILD_PYBIND = "OFF"
+        else:
+            for pybind_arg in args.pybind:
+                if pybind_arg not in VALID_PYBINDS:
+                    raise Exception(
+                        f"Unrecognized pybind argument {pybind_arg}; valid options are: {', '.join(VALID_PYBINDS)}"
+                    )
+                EXECUTORCH_BUILD_PYBIND = "ON"
+                CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON"
+
+    if args.clean:
+        clean()
+        return
+
+    if args.use_pt_pinned_commit:
+        # This option is used in CI to make sure that PyTorch build from the pinned commit
+        # is used instead of nightly. CI jobs wouldn't be able to catch regression from the
+        # latest PT commit otherwise
+        use_pytorch_nightly = False
+
+    install_requirements(use_pytorch_nightly)
+
+    # If --pybind is not set explicitly for backends (e.g., --pybind xnnpack)
+    # or is not turned off explicitly (--pybind off)
+    # then install XNNPACK by default.
+    if EXECUTORCH_BUILD_PYBIND == "":
+        EXECUTORCH_BUILD_PYBIND = "ON"
+        CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON"
+
+    # Use ClangCL on Windows.
+    # ClangCL is an alias to Clang that configures it to work in an MSVC-compatible
+    # mode. Using it on Windows to avoid compiler compatibility issues for MSVC.
+    if os.name == "nt":
+        CMAKE_ARGS += " -T ClangCL"
+
     #
     # Install executorch pip package. This also makes `flatc` available on the path.
     # The --extra-index-url may be necessary if pyproject.toml has a dependency on a

From fbb0395110724717c42720582bb8804b752241e3 Mon Sep 17 00:00:00 2001
From: Dave Bort <dbort@users.noreply.github.com>
Date: Thu, 16 Jan 2025 17:59:38 -0800
Subject: [PATCH 29/40] Validate tensor sizes during method load

Differential Revision: D68180029

Pull Request resolved: https://github.com/pytorch/executorch/pull/7663
---
 runtime/executor/tensor_parser_portable.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp
index 3f190060f7..79e4c4bd96 100644
--- a/runtime/executor/tensor_parser_portable.cpp
+++ b/runtime/executor/tensor_parser_portable.cpp
@@ -101,6 +101,19 @@ Result<Tensor> parseTensor(
     sizes = const_cast<exec_aten::SizesType*>(serialized_sizes);
     dim_order = const_cast<exec_aten::DimOrderType*>(serialized_dim_order);
   }
+  // Validate sizes before using them in case the PTE data is bad. We can't
+  // detect bad positive values, but we can reject negative values, which would
+  // otherwise panic in the TensorImpl ctor. dim_order_to_stride() will validate
+  // dim_order.
+  for (int i = 0; i < dim; i++) {
+    ET_CHECK_OR_RETURN_ERROR(
+        sizes[i] >= 0,
+        InvalidProgram,
+        "Negative size[%d] %" PRId32,
+        i,
+        sizes[i]);
+  }
+
   // We will remove strides from schema.
   // Allocating strides buffer here and populating it.
   // In subsequent diffs we can remove strides accessor, however this

From 1a6b7a6f14c75d87b21c4fc517b0d7c0fe17f761 Mon Sep 17 00:00:00 2001
From: JP <46308822+zonglinpeng@users.noreply.github.com>
Date: Thu, 16 Jan 2025 22:08:59 -0800
Subject: [PATCH 30/40] refactor test targets

Differential Revision: D68194772

Pull Request resolved: https://github.com/pytorch/executorch/pull/7673
---
 examples/cadence/operators/TARGETS            | 25 ++-----------
 examples/cadence/operators/targets.bzl        | 36 +++++++++++++++++++
 ...nv1d_op.py => test_quantized_conv1d_op.py} |  4 ++-
 ...near_op.py => test_quantized_linear_op.py} |  0
 4 files changed, 41 insertions(+), 24 deletions(-)
 create mode 100644 examples/cadence/operators/targets.bzl
 rename examples/cadence/operators/{quantized_conv1d_op.py => test_quantized_conv1d_op.py} (93%)
 rename examples/cadence/operators/{quantized_linear_op.py => test_quantized_linear_op.py} (100%)

diff --git a/examples/cadence/operators/TARGETS b/examples/cadence/operators/TARGETS
index 732f1ced09..67f2bab681 100644
--- a/examples/cadence/operators/TARGETS
+++ b/examples/cadence/operators/TARGETS
@@ -1,26 +1,5 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+load("targets.bzl", "define_common_targets")
 
 oncall("odai_jarvis")
 
-
-python_unittest(
-    name = "test_add_op",
-    srcs = [
-        "test_add_op.py",
-    ],
-    typing = True,
-    supports_static_listing = False,
-    deps = [
-        "fbsource//third-party/pypi/parameterized:parameterized",
-        "//caffe2:torch",
-        "//executorch/backends/cadence/aot:ops_registrations",
-        "//executorch/backends/cadence/aot:export_example",
-        "//executorch/backends/cadence/aot:compiler",
-    ],
-)
+define_common_targets()
diff --git a/examples/cadence/operators/targets.bzl b/examples/cadence/operators/targets.bzl
new file mode 100644
index 0000000000..e1fbeb9fdf
--- /dev/null
+++ b/examples/cadence/operators/targets.bzl
@@ -0,0 +1,36 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+
+TESTS_LIST = [
+    "add_op",
+    "quantized_conv1d_op",
+    "quantized_linear_op",
+]
+
+def define_common_targets():
+    for op in TESTS_LIST:
+        _define_test_target(op)
+
+
+def _define_test_target(test_name):
+    file_name = "test_{}".format(test_name)
+    python_unittest(
+        name = file_name,
+        srcs = [
+            "{}.py".format(file_name),
+        ],
+        typing = True,
+        supports_static_listing = False,
+        deps = [
+            "fbsource//third-party/pypi/parameterized:parameterized",
+            "fbcode//caffe2:torch",
+            "fbcode//executorch/backends/cadence/aot:ops_registrations",
+            "fbcode//executorch/backends/cadence/aot:export_example",
+            "fbcode//executorch/backends/cadence/aot:compiler",
+        ],
+    )
diff --git a/examples/cadence/operators/quantized_conv1d_op.py b/examples/cadence/operators/test_quantized_conv1d_op.py
similarity index 93%
rename from examples/cadence/operators/quantized_conv1d_op.py
rename to examples/cadence/operators/test_quantized_conv1d_op.py
index 3247cb690d..e2457077b2 100644
--- a/examples/cadence/operators/quantized_conv1d_op.py
+++ b/examples/cadence/operators/test_quantized_conv1d_op.py
@@ -8,6 +8,8 @@
 
 import logging
 
+from typing import cast, Sequence
+
 import torch
 
 from executorch.backends.cadence.aot.ops_registrations import *  # noqa
@@ -53,6 +55,6 @@ def forward(self, x: torch.Tensor):
     model = QuantizedConv()
     model.eval()
 
-    example_inputs = (torch.randn(shape),)
+    example_inputs = (torch.randn(cast(Sequence[int], shape)),)
 
     export_model(model, example_inputs)
diff --git a/examples/cadence/operators/quantized_linear_op.py b/examples/cadence/operators/test_quantized_linear_op.py
similarity index 100%
rename from examples/cadence/operators/quantized_linear_op.py
rename to examples/cadence/operators/test_quantized_linear_op.py

From dad73ca6240429e2f79d666547cd61c95c05c427 Mon Sep 17 00:00:00 2001
From: SaoirseARM <44364573+SaoirseARM@users.noreply.github.com>
Date: Fri, 17 Jan 2025 08:57:38 +0000
Subject: [PATCH 31/40] Fix for multiple outputs in FVP tests (#7650)

Fix for multiple outputs in corstone

- Update to ensure all output nodes are consumed.
- Update to ensure output quant scales are used.
---
 .../arm/test/misc/test_multiple_outputs.py    | 47 ++++++++++-
 backends/arm/test/runner_utils.py             | 79 ++++++++++---------
 .../arm/test/tester/analyze_output_utils.py   |  8 +-
 backends/arm/test/tester/arm_tester.py        | 37 +++++----
 4 files changed, 114 insertions(+), 57 deletions(-)

diff --git a/backends/arm/test/misc/test_multiple_outputs.py b/backends/arm/test/misc/test_multiple_outputs.py
index 7762c7dc2f..ddddc94d27 100644
--- a/backends/arm/test/misc/test_multiple_outputs.py
+++ b/backends/arm/test/misc/test_multiple_outputs.py
@@ -6,9 +6,11 @@
 
 import unittest
 
+import pytest
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.exir.backend.compile_spec_schema import CompileSpec
 
 
 class TestMultipleOutputs(unittest.TestCase):
@@ -51,3 +53,46 @@ def test_tosa_BI_pipeline(self):
             .to_executorch()
             .run_method_and_compare_outputs(inputs=inputs, qtol=1.0)
         )
+
+    def _test_ethosu_BI_pipeline(
+        self,
+        module: torch.nn.Module,
+        test_data: tuple[torch.Tensor],
+        compile_spec: CompileSpec,
+    ):
+        tester = (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=compile_spec,
+            )
+            .quantize()
+            .export()
+            .to_edge_transform_and_lower()
+            .to_executorch()
+            .serialize()
+        )
+        if conftest.is_option_enabled("corstone_fvp"):
+            tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
+
+    @pytest.mark.corstone_fvp
+    def test_u85_BI(self):
+        module = self.MultipleOutputsModule()
+        test_data = module.get_inputs()
+        self._test_ethosu_BI_pipeline(
+            module,
+            test_data,
+            common.get_u85_compile_spec(),
+        )
+
+    @pytest.mark.corstone_fvp
+    @conftest.expectedFailureOnFVP
+    # TODO MLETORCH-598
+    def test_u55_BI(self):
+        module = self.MultipleOutputsModule()
+        test_data = module.get_inputs()
+        self._test_ethosu_BI_pipeline(
+            module,
+            test_data,
+            common.get_u55_compile_spec(),
+        )
diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py
index b206e5585b..3851e41b73 100644
--- a/backends/arm/test/runner_utils.py
+++ b/backends/arm/test/runner_utils.py
@@ -115,50 +115,53 @@ def _get_input_quantization_params(
     return quant_params
 
 
-def _get_output_node(program: ExportedProgram) -> Node:
+def _get_output_nodes(program: ExportedProgram) -> list[Node]:
     """
     Get output node to this model.
 
     Args:
-        program (ExportedProgram): The program to get output node from.
+        program (ExportedProgram): The program to get the output nodes from.
     Returns:
-        The node that is the output of 'program'.
+        The nodes that are the outputs of the 'program'.
     """
-
+    output_nodes = []
     for node in program.graph.nodes:
         if node.op == "output":
-            return node
-    raise RuntimeError("No output node found.")
+            for output in node.args[0]:
+                output_nodes.append(output)
+    if len(output_nodes) == 0:
+        raise RuntimeError("No output nodes found.")
+    else:
+        return output_nodes
 
 
 def _get_output_quantization_params(
-    program: ExportedProgram, output_node: Node
-) -> Optional[QuantizationParams]:
+    output_nodes: list[Node],
+) -> List[QuantizationParams]:
     """
     Get output QuantizationParams from a program.
     Args:
-        program (ExportedProgram): The program to get output quantization parameters from.
+        output_nodes (list[Node]): A list of output nodes to get output quantization parameters from.
     Returns:
         QuantizationParams: The found quantization parameters.
     Raises:
         RuntimeError if no output quantization parameters are found.
     """
-
-    quant_params = None
-    for node in program.graph.nodes:
-        if (
-            node.target == torch.ops.quantized_decomposed.dequantize_per_tensor.default
-            and node == output_node.args[0][0]
-        ):
-            quant_params = QuantizationParams(
-                node_name=node.args[0].name,
-                scale=node.args[1],
-                zp=node.args[2],
-                qmin=node.args[3],
-                qmax=node.args[4],
-                dtype=node.args[5],
+    quant_params = []
+    for node in output_nodes:
+        if node.target == torch.ops.quantized_decomposed.dequantize_per_tensor.default:
+            quant_params.append(
+                QuantizationParams(
+                    node_name=node.args[0].name,
+                    scale=node.args[1],
+                    zp=node.args[2],
+                    qmin=node.args[3],
+                    qmax=node.args[4],
+                    dtype=node.args[5],
+                )
             )
-            break  # break early, there's only one output node
+    if len(quant_params) == 0:
+        raise RuntimeError("No quantization parameters found in exported model.")
     return quant_params
 
 
@@ -211,7 +214,7 @@ def __init__(
         self.input_names: list[str] = None
         self.output_name: str = None
         self.qp_input: list[QuantizationParams] = None
-        self.qp_output: QuantizationParams = None
+        self.qp_output: list[QuantizationParams] = None
         self.timeout = 480
         self.target_board: str = None
 
@@ -226,19 +229,17 @@ def init_run(
     ):
 
         self.input_names = _get_input_names(edge_program)
-        self.output_node = _get_output_node(exported_program)
-        self.output_name = self.output_node.name
+        self.output_nodes = _get_output_nodes(exported_program)
+
         self.is_quantized = is_quantized
         self.target_board = target_board
 
         if is_quantized:
             self.qp_input = _get_input_quantization_params(exported_program)
-            self.qp_output = _get_output_quantization_params(
-                exported_program, self.output_node
-            )
+            self.qp_output = _get_output_quantization_params(self.output_nodes)
         else:
             self.qp_input = [None] * len(self.input_names)
-            self.qp_output = None
+            self.qp_output = [None] * len(self.output_nodes)
 
         self._has_init_run = True
 
@@ -265,7 +266,7 @@ def run_corstone(
             save_bytes(self.intermediate_path, data, False, input_name, quant_param)
 
         out_path = os.path.join(self.intermediate_path, "out")
-        out_path_with_suffix = out_path + "-0.bin"
+
         input_paths = []
         for name in self.input_names:
             input_paths.append(
@@ -281,6 +282,7 @@ def run_corstone(
         ), f"Did not find build arm_executor_runner in path {elf_path}, run setup_testing.sh?"
 
         cmd_line = f"executor_runner -m {pte_path} -o {out_path}"
+
         for input_path in input_paths:
             cmd_line += f" -i {input_path}"
 
@@ -362,11 +364,14 @@ def run_corstone(
             raise RuntimeError(
                 f"Corstone simulation failed:\ncmd: {command_args[self.target_board]}\n, log: \n {result_stdout}\n{result.stderr.decode()}"
             )
-
-        tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32)
-        output_shape = self.output_node.args[0][0].meta["val"].shape
-        tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(output_shape)
-        return tosa_ref_output
+        output_np = []
+        for i, node in enumerate(self.output_nodes):
+            tosa_ref_output = np.fromfile(
+                os.path.join(self.intermediate_path, f"out-{i}.bin"), dtype=np.float32
+            )
+            output_shape = node.meta["val"].shape
+            output_np.append(torch.from_numpy(tosa_ref_output).reshape(output_shape))
+        return tuple(output_np)
 
     def run_tosa_graph(
         self, graph: TosaGraph, inputs: list[np.ndarray] | list[torch.Tensor]
diff --git a/backends/arm/test/tester/analyze_output_utils.py b/backends/arm/test/tester/analyze_output_utils.py
index d70f86c4f2..477a96652f 100644
--- a/backends/arm/test/tester/analyze_output_utils.py
+++ b/backends/arm/test/tester/analyze_output_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -9,7 +9,7 @@
 import torch
 from executorch.backends.arm.test.runner_utils import (
     _get_input_quantization_params,
-    _get_output_node,
+    _get_output_nodes,
     _get_output_quantization_params,
 )
 
@@ -228,9 +228,9 @@ def dump_error_output(
     export_stage = tester.stages.get(tester.stage_name(Export), None)
     quantize_stage = tester.stages.get(tester.stage_name(Quantize), None)
     if export_stage is not None and quantize_stage is not None:
-        output_node = _get_output_node(export_stage.artifact)
+        output_nodes = _get_output_nodes(export_stage.artifact)
         qp_input = _get_input_quantization_params(export_stage.artifact)
-        qp_output = _get_output_quantization_params(export_stage.artifact, output_node)
+        qp_output = _get_output_quantization_params(output_nodes)
         logger.error(f"Input QuantArgs: {qp_input}")
         logger.error(f"Output QuantArgs: {qp_output}")
 
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index e5c700ec3c..5b2f9201fc 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -14,6 +14,7 @@
 import serializer.tosa_serializer as ts
 
 import torch.fx
+import torch.utils._pytree as pytree
 
 from executorch.backends.arm.arm_backend import get_intermediate_path
 from executorch.backends.arm.arm_partitioner import ArmPartitioner
@@ -302,6 +303,7 @@ def run_method_and_compare_outputs(
 
         exported_program = self.stages[self.stage_name(tester.Export)].artifact
         edge_program = edge_stage.artifact.exported_program()
+
         self.runner_util.init_run(
             exported_program,
             edge_program,
@@ -309,14 +311,14 @@ def run_method_and_compare_outputs(
             target_board,
         )
 
-        quantization_scale = None
         if is_quantized:
             reference_stage = self.stages[self.stage_name(tester.Quantize)]
             # bool output is quantized with none quantized output so allow
             # self.runner_util.qp_output to be none
             if self.runner_util.qp_output is not None:
-                quantization_scale = self.runner_util.qp_output.scale
+                quantization_scales = [qp.scale for qp in self.runner_util.qp_output]
         else:
+            quantization_scales = [None] * len(self.runner_util.output_nodes)
             reference_stage = self.stages[self.stage_name(InitialModel)]
 
         logger.info(
@@ -334,21 +336,26 @@ def run_method_and_compare_outputs(
             input_shape_str = ", ".join([str(list(i)) for i in input_shapes])
             logger.info(f"Run #{run_iteration}, input shapes: {input_shape_str}")
 
-            reference_output = reference_stage.run_artifact(reference_input)
-            if not isinstance(reference_output, tuple):
-                reference_output = (reference_output,)
-            test_output = test_stage.run_artifact(reference_input)
-
-            self._compare_outputs(
-                reference_output,
-                test_output,
-                quantization_scale,
-                atol,
-                rtol,
-                qtol,
-                error_callbacks,
+            reference_outputs, _ = pytree.tree_flatten(
+                reference_stage.run_artifact(reference_input)
+            )
+            test_outputs, _ = pytree.tree_flatten(
+                test_stage.run_artifact(reference_input)
             )
 
+            for reference_output, test_output, quantization_scale in zip(
+                reference_outputs, test_outputs, quantization_scales
+            ):
+                self._compare_outputs(
+                    reference_output,
+                    test_output,
+                    quantization_scale,
+                    atol,
+                    rtol,
+                    qtol,
+                    error_callbacks,
+                )
+
         return self
 
     def get_graph(self, stage: str | None = None) -> Graph:

From cb45fb6ccb1a1b2dd170bc047617cc2e9ff592ab Mon Sep 17 00:00:00 2001
From: Thibaut Goetghebuer-Planchon <thibaut.goetghebuer-planchon@arm.com>
Date: Fri, 17 Jan 2025 08:59:39 +0000
Subject: [PATCH 32/40] Fix uninitialized variable type-check in
 FuseQuantizedActivationPass (#7671)

---
 backends/arm/_passes/fuse_quantized_activation_pass.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/backends/arm/_passes/fuse_quantized_activation_pass.py b/backends/arm/_passes/fuse_quantized_activation_pass.py
index 86836842bb..4eccea1a14 100644
--- a/backends/arm/_passes/fuse_quantized_activation_pass.py
+++ b/backends/arm/_passes/fuse_quantized_activation_pass.py
@@ -19,12 +19,13 @@ def _is_fuseable_quantized_activation(self, node: Node):
             is_fuseable = min_val == 0
 
         is_quantized = len(node.users) == 1 and next(iter(node.users)).target == q_op
-        if is_quantized:
+        if is_fuseable and is_quantized:
             quant_node = next(iter(node.users))
             zp = quant_node.args[2]
             qmin = quant_node.args[3]
-
-        return is_fuseable and is_quantized and zp == qmin
+            return zp == qmin
+        else:
+            return False
 
     def _is_fuseable_input(self, node: Node):
         return (

From ffc20208dae8f4900da11bfffb76f749e7514132 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Per=20=C3=85strand?= <per@users.noreply.github.com>
Date: Fri, 17 Jan 2025 11:24:37 +0100
Subject: [PATCH 33/40] Remove unused functions for quantization handling
 (#7700)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove unused functions that searched the graph for quantization information.


Signed-off-by: Per Åstrand <per.astrand@arm.com>
---
 .../annotate_channels_last_dim_order_pass.py  |   7 +-
 backends/arm/operators/__init__.py            |   2 -
 backends/arm/operators/op_dequant.py          |  35 ---
 backends/arm/operators/op_hardtanh.py         |   7 +-
 backends/arm/operators/op_quant.py            |  35 ---
 backends/arm/operators/op_relu.py             |   8 +-
 backends/arm/process_node.py                  |  22 +-
 backends/arm/tosa_quant_utils.py              | 270 +-----------------
 backends/arm/tosa_utils.py                    |  28 --
 examples/arm/aot_arm_compiler.py              |   6 +-
 10 files changed, 21 insertions(+), 399 deletions(-)
 delete mode 100644 backends/arm/operators/op_dequant.py
 delete mode 100644 backends/arm/operators/op_quant.py

diff --git a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
index 80c5f3c442..4aff46de67 100644
--- a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
+++ b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -15,7 +15,7 @@
     get_node_arg,
     insert_q_dq_pair,
 )
-from executorch.backends.arm.tosa_quant_utils import dq_op, q_op, register_passable_op
+from executorch.backends.arm.tosa_quant_utils import dq_op, q_op
 from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
@@ -43,9 +43,6 @@ def _transpose_impl(*args, **kwargs):
     return args[0]
 
 
-register_passable_op(torch.ops.passthrough_to_tosa._transpose)
-
-
 class AnnotateChannelsLastDimOrder(ExportPass):
     """
     Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
index 157e5ec092..a21bde535e 100644
--- a/backends/arm/operators/__init__.py
+++ b/backends/arm/operators/__init__.py
@@ -13,7 +13,6 @@
     op_bmm,
     op_cat,
     op_conv2d,
-    op_dequant,
     op_exp,
     op_full,
     op_get_item,
@@ -24,7 +23,6 @@
     op_min,
     op_mul,
     op_permute,
-    op_quant,
     op_reciprocal,
     op_relu,
     op_repeat,
diff --git a/backends/arm/operators/op_dequant.py b/backends/arm/operators/op_dequant.py
deleted file mode 100644
index 022f4e45ce..0000000000
--- a/backends/arm/operators/op_dequant.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# pyre-unsafe
-from typing import List
-
-import serializer.tosa_serializer as ts
-import torch
-from executorch.backends.arm.operators.node_visitor import (
-    NodeVisitor,
-    register_node_visitor,
-)
-from executorch.backends.arm.tosa_mapping import TosaArg
-from serializer.tosa_serializer import TosaOp
-
-
-@register_node_visitor
-class DequantVisitor(NodeVisitor):
-    target = "quantized_decomposed.dequantize_per_tensor.default"
-
-    def __init__(self, *args):
-        super().__init__(*args)
-
-    def define_node(
-        self,
-        node: torch.fx.Node,
-        tosa_graph: ts.TosaSerializer,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        item_name = inputs[0].name
-        ## Simply add an identityOp
-        tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name])
diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py
index bfbab55b92..c971b50b66 100644
--- a/backends/arm/operators/op_hardtanh.py
+++ b/backends/arm/operators/op_hardtanh.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -19,7 +19,6 @@
 )
 from executorch.backends.arm.tosa_mapping import TosaArg
 
-from executorch.backends.arm.tosa_quant_utils import quantize_value
 from serializer.tosa_serializer import TosaOp
 
 
@@ -44,8 +43,8 @@ def define_node(
             input_qparams = get_input_qparams(node)  # pyre-ignore[16]
             qargs = input_qparams[0]
             # Convert to quantized representation
-            clamp_min_qs = quantize_value(inputs[1].number, qargs)
-            clamp_max_qs = quantize_value(inputs[2].number, qargs)
+            clamp_min_qs = qargs.quantize_value(inputs[1].number).item()
+            clamp_max_qs = qargs.quantize_value(inputs[2].number).item()
             # Set fp values to 0.0 since they are not used
             clamp_min_fp = 0.0
             clamp_max_fp = 0.0
diff --git a/backends/arm/operators/op_quant.py b/backends/arm/operators/op_quant.py
deleted file mode 100644
index fcf9372c11..0000000000
--- a/backends/arm/operators/op_quant.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# pyre-unsafe
-from typing import List
-
-import serializer.tosa_serializer as ts
-import torch
-from executorch.backends.arm.operators.node_visitor import (
-    NodeVisitor,
-    register_node_visitor,
-)
-from executorch.backends.arm.tosa_mapping import TosaArg
-from serializer.tosa_serializer import TosaOp
-
-
-@register_node_visitor
-class QuantVisitor(NodeVisitor):
-    target = "quantized_decomposed.quantize_per_tensor.default"
-
-    def __init__(self, *args):
-        super().__init__(*args)
-
-    def define_node(
-        self,
-        node: torch.fx.Node,
-        tosa_graph: ts.TosaSerializer,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        item_name = inputs[0].name
-        ## Simply add an identityOp
-        tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name])
diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py
index 4df13e71b7..b5ffa2aa70 100644
--- a/backends/arm/operators/op_relu.py
+++ b/backends/arm/operators/op_relu.py
@@ -1,11 +1,10 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
 
-import executorch.backends.arm.tosa_quant_utils as tqutils
 import serializer.tosa_serializer as ts
 import torch.fx
 
@@ -43,9 +42,8 @@ def define_node(
         clamp_max_qs = 0
         if inputs[0].dtype == ts.DType.INT8:
             out_qargs = get_output_qparams(node)  # pyre-ignore[16]
-            clamp_min_qs = tqutils.quantize_value(0, out_qargs[0])
-            clamp_max_qs = tqutils.quantize_value(float("inf"), out_qargs[0])
-
+            clamp_min_qs = out_qargs[0].quantize_value(0).item()
+            clamp_max_qs = out_qargs[0].quantize_value(float("inf")).item()
         else:
             clamp_min_fp = 0
             clamp_max_fp = float("inf")
diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py
index 9ab9c49044..36a1567df9 100644
--- a/backends/arm/process_node.py
+++ b/backends/arm/process_node.py
@@ -12,12 +12,7 @@
 import torch
 import torch.fx
 from executorch.backends.arm.operators.node_visitor import NodeVisitor
-from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg
-from executorch.backends.arm.tosa_quant_utils import (
-    dq_op,
-    get_quantized_node_output_dtype,
-    is_node_quantized,
-)
+from executorch.backends.arm.tosa_mapping import TosaArg
 from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.arm.tosa_utils import getNodeArgs, tosa_shape
 from torch.export.exported_program import ExportedProgram
@@ -35,15 +30,8 @@ def process_call_function(
     # Convert output (this node itself)
     output = TosaArg(node)
 
-    is_dq_node = node.target == dq_op
-    if is_dq_node:
-        output_dtype = ts.DType.INT8
-    else:
-        output_dtype = output.dtype
     tosa_graph.currRegion.currBasicBlock.addTensor(
-        output.name,
-        tosa_shape(output.shape, output.dim_order),
-        output_dtype,
+        output.name, tosa_shape(output.shape, output.dim_order), output.dtype
     )
 
     # Visiting each Node
@@ -79,11 +67,7 @@ def process_inputs(
     tensor = ts.TosaSerializerTensor(
         inputs[0].name,
         tosa_shape(input_shape, input_dim_order),
-        (
-            map_dtype(get_quantized_node_output_dtype(node))
-            if is_node_quantized(node)
-            else inputs[0].dtype
-        ),
+        inputs[0].dtype,
         data=None,
         placeholderFilename=inputs[0].name + ".npy",
     )
diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py
index dff7b12cdd..9869a08c0b 100644
--- a/backends/arm/tosa_quant_utils.py
+++ b/backends/arm/tosa_quant_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -8,9 +8,7 @@
 # Utiliy functions for TOSA quantized lowerings
 
 import math
-from typing import Callable, cast, NamedTuple, Sequence
-
-import numpy as np
+from typing import cast, NamedTuple
 
 import serializer.tosa_serializer as ts
 import torch.fx
@@ -24,22 +22,6 @@
 q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
 dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
 dq_q_ops = (q_op, dq_op)
-passable_ops = [
-    exir_ops.edge.aten.view_copy.default,
-    exir_ops.edge.aten.permute_copy.default,
-    exir_ops.edge.aten.squeeze_copy.dims,
-    exir_ops.edge.aten.unsqueeze_copy.default,
-    exir_ops.edge.aten.split_with_sizes_copy.default,
-    exir_ops.edge.aten.repeat.default,
-    exir_ops.edge.aten.clone.default,
-    exir_ops.edge.aten.slice_copy.Tensor,
-    exir_ops.edge.aten.cat.default,
-]
-
-
-def register_passable_op(op):
-    """We need to be able to add custom ops such as tosa_transpose to the passable_op list after they have been created"""
-    passable_ops.append(op)
 
 
 def insert_rescale_ops_to_int32(
@@ -53,8 +35,7 @@ def insert_rescale_ops_to_int32(
 
     This functions is used in serialization to TOSA for target ops that are
     handled by the DQ/D folding pass, which stores the quantization parameters
-    in the node meta dict as opposed to 'rescale_nodes_to_int32' which search
-    the graph upstream for DQ nodes.
+    in the node meta dict.
     """
 
     # pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
@@ -100,13 +81,12 @@ def insert_rescale_op_to_int8(
     Parameters:
         node: The original node that is being handled by the rescales.
         last_tensor:the tosa tensor to rescale back.
-        scale: the scaling factor used to rescale to int32, from the function 'rescale_nodes_to_int32'
+        scale: the scaling factor used to rescale to int32, from the function 'insert_rescale_ops_to_int32'
         tosa_graph: the tosa_graph to manipulate.
 
     This functions is used in serialization to TOSA for target ops that are
     handled by the DQ/D folding pass, which stores the quantization parameters
-    in the node meta dict as opposed to 'rescale_node_back_to_int8' which search
-    the graph downstream for Q nodes.
+    in the node meta dict.
     """
     # pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
     from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
@@ -148,17 +128,6 @@ def quantize_value(self, x):
     def dequantize_value(self, qx: torch.Tensor) -> torch.Tensor:
         return (qx - self.zp) * self.scale
 
-    def __eq__(self, other):
-        if isinstance(other, QuantArgs):
-            return (
-                self.scale == other.scale
-                and self.zp == other.zp
-                and self.qmin == other.qmin
-                and self.qmax == other.qmax
-                and self.dtype == other.dtype
-            )
-        return False
-
     @classmethod
     def from_operator(cls, op, args):
         if op in dq_q_ops:
@@ -174,172 +143,6 @@ def from_operator(cls, op, args):
             raise NotImplementedError
 
 
-def quantize_value(x, qargs: QuantArgs, dtype=np.int8):
-    return np.clip(
-        np.round(x / qargs.scale) + qargs.zp,
-        qargs.qmin,
-        qargs.qmax,
-    ).astype(dtype)
-
-
-def dequantize_value(qx, qargs: QuantArgs):
-    return (np.int64(qx) - qargs.zp) * qargs.scale
-
-
-def qargs_from_qnode(node: torch.fx.Node):
-    assert node.target in dq_q_ops, f"Op {node} is not a quant node."
-
-    return QuantArgs.from_operator(node.target, node.args)
-
-
-def get_neighbour_quant_args(
-    node: torch.fx.Node,
-) -> tuple[list[QuantArgs], list[QuantArgs]]:
-    user_q_args = []
-
-    for user in node.users:
-        q_args = search_quant_arg_downstream(user)
-        if q_args:
-            user_q_args.append(q_args)
-
-    input_q_nodes = []
-    for input_node in node.all_input_nodes:
-        q_args = search_quant_arg_upstream(input_node)
-        if q_args:
-            input_q_nodes.append(q_args)
-    return user_q_args, input_q_nodes
-
-
-def all_q_args_equal(q_arg_list: list[QuantArgs]) -> bool:
-    first_q_arg = q_arg_list[0]
-    for q_arg in q_arg_list:
-        if q_arg != first_q_arg:
-            return False
-    return True
-
-
-def is_node_quantized(node: torch.fx.Node) -> bool:
-    if node.target in dq_q_ops:
-        return True
-
-    user_q_args, input_q_args = get_neighbour_quant_args(node)
-
-    # If we did not find any neighbouring quant nodes, we are not quantized.
-    if len(input_q_args) == 0 and len(user_q_args) == 0:
-        return False
-
-    if node.target in passable_ops:
-        assert all_q_args_equal(
-            user_q_args + input_q_args
-        ), f"Node {node} needs same quantization parameters on all inputs and outputs."
-
-    return True
-
-
-def search_quant_arg_downstream(node: torch.fx.Node) -> QuantArgs | None:
-    """
-    Iterates downward in the graph passing through 'passable_ops' to find and return a quantization node,
-    starting with 'node'.
-    If a  passable node with multiple consumers is encountered,
-    find QuantArgs for all consumers and assert that they are equal.
-    If a node not in passable_ops is encountered, return None.
-    If a node without consumers is encountered, return None.
-    """
-    if node.target in dq_q_ops:
-        return qargs_from_qnode(node)
-    if node.target not in passable_ops:
-        return None
-    consumer_nodes = list(node.users)
-    if len(consumer_nodes) == 0:
-        return None
-    elif len(consumer_nodes) == 1:
-        return search_quant_arg_downstream(consumer_nodes[0])
-    else:
-        consumer_qargs: list[QuantArgs] = []
-        for input in consumer_nodes:
-            quant_args = search_quant_arg_downstream(input)
-            if quant_args:
-                consumer_qargs.append(quant_args)
-        if len(consumer_qargs) == 0:
-            return None
-        assert all_q_args_equal(
-            consumer_qargs
-        ), f"Encountered a op, {node}, in passable_ops with different QuantArgs for different consumers."
-        return consumer_qargs[0]
-
-
-def get_quant_arg_downstream(node: torch.fx.Node) -> QuantArgs:
-    """Calls search_quant_arg_downstream and asserts that QuantArgs are found,
-    meaning return value can't be None.
-    """
-    qargs = search_quant_arg_downstream(node)
-    assert qargs, f"Did not find QuantArgs downstream for node {node}"
-    return qargs
-
-
-def search_quant_arg_upstream(node: torch.fx.Node) -> QuantArgs | None:
-    """
-    Iterates upward in the graph passing through 'passable_ops' to find and return a quantization node,
-    starting with 'node'.
-    If a  passable node with multiple inputs is encountered,
-    find QuantArgs for all inputs and assert that they are equal.
-    If a node not in passable_ops is encountered, return None.
-    If a node without inputs is encountered, return None.
-    """
-
-    if node.target in dq_q_ops:
-        return qargs_from_qnode(node)
-    if node.target not in passable_ops:
-        return None
-    input_nodes = list(node.all_input_nodes)
-    if len(input_nodes) == 0:
-        return None
-    elif len(input_nodes) == 1:
-        return search_quant_arg_upstream(input_nodes[0])
-    else:
-        input_qargs: list[QuantArgs] = []
-        for input in input_nodes:
-            quant_args = search_quant_arg_upstream(input)
-            if quant_args:
-                input_qargs.append(quant_args)
-        if len(input_qargs) == 0:
-            return None
-        assert all_q_args_equal(
-            input_qargs
-        ), f"Encountered a op, {node}, in passable_ops with different QuantArgs for different inputs."
-        return input_qargs[0]
-
-
-def get_quant_arg_upstream(node: torch.fx.Node) -> QuantArgs:
-    """Calls search_quant_arg_upstream and asserts that QuantArgs are found,
-    meaning return value can't be None.
-    """
-    qargs = search_quant_arg_upstream(node)
-    assert qargs, f"Did not find QuantArgs upstream for node {node}"
-    return qargs
-
-
-def get_quantized_node_output_dtype(node: torch.fx.Node) -> torch.dtype:
-    if isinstance(node.target, Callable) and "output_qparams" in node.meta.keys():
-        # Check if the node has had it's quantization parameters folded
-        # and retrieve the dtype from the meta dict in that case.
-        assert len(node.meta["output_qparams"]) == 1
-        qargs = cast(QuantArgs, node.meta["output_qparams"][0])
-        return qargs.dtype
-
-    if node.target in dq_q_ops:
-        return cast(torch.dtype, node.args[5])
-
-    # if not a tosa node, nor a q/dq op, walk the graph until we find a q op
-    user_q_args, input_q_args = get_neighbour_quant_args(node)
-    if len(user_q_args) > 0:
-        return user_q_args[0].dtype
-    elif node.target in passable_ops and len(input_q_args) > 0:
-        return input_q_args[0].dtype
-    else:
-        raise RuntimeError("No quantized node found in graph")
-
-
 # Check if scale32 mode is used for given output element type
 def is_scale32(type):
     return type == ts.DType.INT8
@@ -476,69 +279,6 @@ def build_rescale_from_int32(
     return
 
 
-def rescale_nodes_to_int32(
-    nodes: Sequence[Node], tosa_graph: ts.TosaSerializer
-) -> tuple[list[TosaSerializerTensor], float]:
-    """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'.
-    The scales are adjusted using the smallest scale of all 'nodes'.
-
-    Returns a list of the rescaled nodes and the scale factor used,
-    needed by rescale_node_back_to_int8.
-    """
-
-    tensors = [TosaArg(node) for node in nodes]
-
-    # Reshape tensor according to tosa dim order
-    for tensor in tensors:
-        dim_order = tensor.dim_order
-        tensor.shape = [tensor.shape[i] for i in dim_order]
-
-    qargs = [get_quant_arg_upstream(node) for node in nodes]
-
-    # Scale the int8 quantized input to a common scale in the integer
-    # domain
-    min_scale = min([qarg.scale for qarg in qargs])
-    scales = [qarg.scale / min_scale for qarg in qargs]
-
-    rescaled_nodes: list[TosaSerializerTensor] = []
-    for tensor, qarg, scale in zip(tensors, qargs, scales):
-        rescaled_nodes.append(
-            build_rescale_to_int32(
-                tosa_graph,
-                tensor,
-                qarg.zp,
-                scale,
-            )
-        )
-    return rescaled_nodes, min_scale
-
-
-def rescale_node_back_to_int8(
-    node: Node,
-    last_tensor: TosaSerializerTensor,
-    scale: float,
-    tosa_graph: ts.TosaSerializer,
-):
-    """Rescales the node back to int8, adding a suitable RESCALE op to 'tosa_graph'.
-    Parameters:
-        node: The original node that is being handled by the rescales.
-        last_tensor:the tosa tensor to rescale back.
-        scale: the scaling factor used to rescale to int32, from the function 'rescale_nodes_to_int32'
-        tosa_graph: the tosa_graph to manipulate.
-    """
-    qargs_out = get_quant_arg_downstream(list(node.users)[0])
-    output_rescale_scale = scale / qargs_out.scale
-
-    # Rescale Back to INT8
-    build_rescale_from_int32(
-        tosa_graph,
-        last_tensor.name,
-        node.name,
-        qargs_out.zp,
-        output_rescale_scale,
-    )
-
-
 """ Creates a TOSA rescale op based on conv2d parameters. """
 
 
diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py
index c03e0ef0bb..9fefdbb3ff 100644
--- a/backends/arm/tosa_utils.py
+++ b/backends/arm/tosa_utils.py
@@ -115,10 +115,6 @@ def getNodeArgs(node: Node) -> list[TosaArg]:
     return [TosaArg(arg) for arg in node.args]
 
 
-def get_input_tensor(node: Node) -> TosaArg:
-    return TosaArg(node.args[0])
-
-
 def get_output_node(node: Node) -> Node:
     return list(node.users)[0]
 
@@ -146,30 +142,6 @@ def is_consumer_node_depthwise_conv2d(node):
     return False
 
 
-def get_two_inputs(node: Node, check: bool = False) -> tuple[Node, Node]:
-    """Returns two input nodes to 'node' in order. If 'node' only has one input,
-    it is returned twice.
-
-    Fails if there are no input nodes.
-    Fails if there are >2 input nodes and 'check' is True,
-    """
-
-    num_inputs = len(node.all_input_nodes)
-    assert num_inputs > 0, f"Node '{node.name}' requires >0 input, got {num_inputs}."
-
-    input1 = node.all_input_nodes[0]
-    if num_inputs == 1:
-        input2 = node.all_input_nodes[0]
-    else:
-        input2 = node.all_input_nodes[1]
-    if check:
-        assert (
-            num_inputs <= 2
-        ), f"Node '{node.name}' requires <=2 inputs, got {num_inputs}."
-
-    return input1, input2
-
-
 def tosa_shape(shape, dim_order):
     return tuple([shape[dim] for dim in dim_order])
 
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 9563be93aa..a49436193b 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -264,7 +264,11 @@ def get_compile_spec(
 ) -> list[CompileSpec]:
     spec_builder = None
     if target == "TOSA":
-        spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI")
+        spec_builder = (
+            ArmCompileSpecBuilder()
+            .tosa_compile_spec("TOSA-0.80+BI")
+            .set_quantize_io(True)
+        )
     elif "ethos-u55" in target:
         spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec(
             target,

From eaad7ff1ece5524b8892be9a3c40a3636ec2b64f Mon Sep 17 00:00:00 2001
From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com>
Date: Fri, 17 Jan 2025 11:36:03 +0100
Subject: [PATCH 34/40] Revert "Remove unused functions for quantization
 handling" (#7724)

Revert "Remove unused functions for quantization handling (#7700)"

This reverts commit ffc20208dae8f4900da11bfffb76f749e7514132.
---
 .../annotate_channels_last_dim_order_pass.py  |   7 +-
 backends/arm/operators/__init__.py            |   2 +
 backends/arm/operators/op_dequant.py          |  35 +++
 backends/arm/operators/op_hardtanh.py         |   7 +-
 backends/arm/operators/op_quant.py            |  35 +++
 backends/arm/operators/op_relu.py             |   8 +-
 backends/arm/process_node.py                  |  22 +-
 backends/arm/tosa_quant_utils.py              | 270 +++++++++++++++++-
 backends/arm/tosa_utils.py                    |  28 ++
 examples/arm/aot_arm_compiler.py              |   6 +-
 10 files changed, 399 insertions(+), 21 deletions(-)
 create mode 100644 backends/arm/operators/op_dequant.py
 create mode 100644 backends/arm/operators/op_quant.py

diff --git a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
index 4aff46de67..80c5f3c442 100644
--- a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
+++ b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
+# Copyright 2024 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -15,7 +15,7 @@
     get_node_arg,
     insert_q_dq_pair,
 )
-from executorch.backends.arm.tosa_quant_utils import dq_op, q_op
+from executorch.backends.arm.tosa_quant_utils import dq_op, q_op, register_passable_op
 from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
@@ -43,6 +43,9 @@ def _transpose_impl(*args, **kwargs):
     return args[0]
 
 
+register_passable_op(torch.ops.passthrough_to_tosa._transpose)
+
+
 class AnnotateChannelsLastDimOrder(ExportPass):
     """
     Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
index a21bde535e..157e5ec092 100644
--- a/backends/arm/operators/__init__.py
+++ b/backends/arm/operators/__init__.py
@@ -13,6 +13,7 @@
     op_bmm,
     op_cat,
     op_conv2d,
+    op_dequant,
     op_exp,
     op_full,
     op_get_item,
@@ -23,6 +24,7 @@
     op_min,
     op_mul,
     op_permute,
+    op_quant,
     op_reciprocal,
     op_relu,
     op_repeat,
diff --git a/backends/arm/operators/op_dequant.py b/backends/arm/operators/op_dequant.py
new file mode 100644
index 0000000000..022f4e45ce
--- /dev/null
+++ b/backends/arm/operators/op_dequant.py
@@ -0,0 +1,35 @@
+# Copyright 2023-2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+from typing import List
+
+import serializer.tosa_serializer as ts
+import torch
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+from serializer.tosa_serializer import TosaOp
+
+
+@register_node_visitor
+class DequantVisitor(NodeVisitor):
+    target = "quantized_decomposed.dequantize_per_tensor.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+    ) -> None:
+        item_name = inputs[0].name
+        ## Simply add an identityOp
+        tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name])
diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py
index c971b50b66..bfbab55b92 100644
--- a/backends/arm/operators/op_hardtanh.py
+++ b/backends/arm/operators/op_hardtanh.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2025 Arm Limited and/or its affiliates.
+# Copyright 2023-2024 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -19,6 +19,7 @@
 )
 from executorch.backends.arm.tosa_mapping import TosaArg
 
+from executorch.backends.arm.tosa_quant_utils import quantize_value
 from serializer.tosa_serializer import TosaOp
 
 
@@ -43,8 +44,8 @@ def define_node(
             input_qparams = get_input_qparams(node)  # pyre-ignore[16]
             qargs = input_qparams[0]
             # Convert to quantized representation
-            clamp_min_qs = qargs.quantize_value(inputs[1].number).item()
-            clamp_max_qs = qargs.quantize_value(inputs[2].number).item()
+            clamp_min_qs = quantize_value(inputs[1].number, qargs)
+            clamp_max_qs = quantize_value(inputs[2].number, qargs)
             # Set fp values to 0.0 since they are not used
             clamp_min_fp = 0.0
             clamp_max_fp = 0.0
diff --git a/backends/arm/operators/op_quant.py b/backends/arm/operators/op_quant.py
new file mode 100644
index 0000000000..fcf9372c11
--- /dev/null
+++ b/backends/arm/operators/op_quant.py
@@ -0,0 +1,35 @@
+# Copyright 2023-2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+from typing import List
+
+import serializer.tosa_serializer as ts
+import torch
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+from serializer.tosa_serializer import TosaOp
+
+
+@register_node_visitor
+class QuantVisitor(NodeVisitor):
+    target = "quantized_decomposed.quantize_per_tensor.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+    ) -> None:
+        item_name = inputs[0].name
+        ## Simply add an identityOp
+        tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name])
diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py
index b5ffa2aa70..4df13e71b7 100644
--- a/backends/arm/operators/op_relu.py
+++ b/backends/arm/operators/op_relu.py
@@ -1,10 +1,11 @@
-# Copyright 2024-2025 Arm Limited and/or its affiliates.
+# Copyright 2024 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
 
+import executorch.backends.arm.tosa_quant_utils as tqutils
 import serializer.tosa_serializer as ts
 import torch.fx
 
@@ -42,8 +43,9 @@ def define_node(
         clamp_max_qs = 0
         if inputs[0].dtype == ts.DType.INT8:
             out_qargs = get_output_qparams(node)  # pyre-ignore[16]
-            clamp_min_qs = out_qargs[0].quantize_value(0).item()
-            clamp_max_qs = out_qargs[0].quantize_value(float("inf")).item()
+            clamp_min_qs = tqutils.quantize_value(0, out_qargs[0])
+            clamp_max_qs = tqutils.quantize_value(float("inf"), out_qargs[0])
+
         else:
             clamp_min_fp = 0
             clamp_max_fp = float("inf")
diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py
index 36a1567df9..9ab9c49044 100644
--- a/backends/arm/process_node.py
+++ b/backends/arm/process_node.py
@@ -12,7 +12,12 @@
 import torch
 import torch.fx
 from executorch.backends.arm.operators.node_visitor import NodeVisitor
-from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg
+from executorch.backends.arm.tosa_quant_utils import (
+    dq_op,
+    get_quantized_node_output_dtype,
+    is_node_quantized,
+)
 from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.arm.tosa_utils import getNodeArgs, tosa_shape
 from torch.export.exported_program import ExportedProgram
@@ -30,8 +35,15 @@ def process_call_function(
     # Convert output (this node itself)
     output = TosaArg(node)
 
+    is_dq_node = node.target == dq_op
+    if is_dq_node:
+        output_dtype = ts.DType.INT8
+    else:
+        output_dtype = output.dtype
     tosa_graph.currRegion.currBasicBlock.addTensor(
-        output.name, tosa_shape(output.shape, output.dim_order), output.dtype
+        output.name,
+        tosa_shape(output.shape, output.dim_order),
+        output_dtype,
     )
 
     # Visiting each Node
@@ -67,7 +79,11 @@ def process_inputs(
     tensor = ts.TosaSerializerTensor(
         inputs[0].name,
         tosa_shape(input_shape, input_dim_order),
-        inputs[0].dtype,
+        (
+            map_dtype(get_quantized_node_output_dtype(node))
+            if is_node_quantized(node)
+            else inputs[0].dtype
+        ),
         data=None,
         placeholderFilename=inputs[0].name + ".npy",
     )
diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py
index 9869a08c0b..dff7b12cdd 100644
--- a/backends/arm/tosa_quant_utils.py
+++ b/backends/arm/tosa_quant_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2025 Arm Limited and/or its affiliates.
+# Copyright 2023-2024 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -8,7 +8,9 @@
 # Utiliy functions for TOSA quantized lowerings
 
 import math
-from typing import cast, NamedTuple
+from typing import Callable, cast, NamedTuple, Sequence
+
+import numpy as np
 
 import serializer.tosa_serializer as ts
 import torch.fx
@@ -22,6 +24,22 @@
 q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
 dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
 dq_q_ops = (q_op, dq_op)
+passable_ops = [
+    exir_ops.edge.aten.view_copy.default,
+    exir_ops.edge.aten.permute_copy.default,
+    exir_ops.edge.aten.squeeze_copy.dims,
+    exir_ops.edge.aten.unsqueeze_copy.default,
+    exir_ops.edge.aten.split_with_sizes_copy.default,
+    exir_ops.edge.aten.repeat.default,
+    exir_ops.edge.aten.clone.default,
+    exir_ops.edge.aten.slice_copy.Tensor,
+    exir_ops.edge.aten.cat.default,
+]
+
+
+def register_passable_op(op):
+    """We need to be able to add custom ops such as tosa_transpose to the passable_op list after they have been created"""
+    passable_ops.append(op)
 
 
 def insert_rescale_ops_to_int32(
@@ -35,7 +53,8 @@ def insert_rescale_ops_to_int32(
 
     This functions is used in serialization to TOSA for target ops that are
     handled by the DQ/D folding pass, which stores the quantization parameters
-    in the node meta dict.
+    in the node meta dict as opposed to 'rescale_nodes_to_int32' which search
+    the graph upstream for DQ nodes.
     """
 
     # pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
@@ -81,12 +100,13 @@ def insert_rescale_op_to_int8(
     Parameters:
         node: The original node that is being handled by the rescales.
         last_tensor:the tosa tensor to rescale back.
-        scale: the scaling factor used to rescale to int32, from the function 'insert_rescale_op_to_int32'
+        scale: the scaling factor used to rescale to int32, from the function 'rescale_nodes_to_int32'
         tosa_graph: the tosa_graph to manipulate.
 
     This functions is used in serialization to TOSA for target ops that are
     handled by the DQ/D folding pass, which stores the quantization parameters
-    in the node meta dict.
+    in the node meta dict as opposed to 'rescale_node_back_to_int8' which search
+    the graph downstream for Q nodes.
     """
     # pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.'
     from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
@@ -128,6 +148,17 @@ def quantize_value(self, x):
     def dequantize_value(self, qx: torch.Tensor) -> torch.Tensor:
         return (qx - self.zp) * self.scale
 
+    def __eq__(self, other):
+        if isinstance(other, QuantArgs):
+            return (
+                self.scale == other.scale
+                and self.zp == other.zp
+                and self.qmin == other.qmin
+                and self.qmax == other.qmax
+                and self.dtype == other.dtype
+            )
+        return False
+
     @classmethod
     def from_operator(cls, op, args):
         if op in dq_q_ops:
@@ -143,6 +174,172 @@ def from_operator(cls, op, args):
             raise NotImplementedError
 
 
+def quantize_value(x, qargs: QuantArgs, dtype=np.int8):
+    return np.clip(
+        np.round(x / qargs.scale) + qargs.zp,
+        qargs.qmin,
+        qargs.qmax,
+    ).astype(dtype)
+
+
+def dequantize_value(qx, qargs: QuantArgs):
+    return (np.int64(qx) - qargs.zp) * qargs.scale
+
+
+def qargs_from_qnode(node: torch.fx.Node):
+    assert node.target in dq_q_ops, f"Op {node} is not a quant node."
+
+    return QuantArgs.from_operator(node.target, node.args)
+
+
+def get_neighbour_quant_args(
+    node: torch.fx.Node,
+) -> tuple[list[QuantArgs], list[QuantArgs]]:
+    user_q_args = []
+
+    for user in node.users:
+        q_args = search_quant_arg_downstream(user)
+        if q_args:
+            user_q_args.append(q_args)
+
+    input_q_nodes = []
+    for input_node in node.all_input_nodes:
+        q_args = search_quant_arg_upstream(input_node)
+        if q_args:
+            input_q_nodes.append(q_args)
+    return user_q_args, input_q_nodes
+
+
+def all_q_args_equal(q_arg_list: list[QuantArgs]) -> bool:
+    first_q_arg = q_arg_list[0]
+    for q_arg in q_arg_list:
+        if q_arg != first_q_arg:
+            return False
+    return True
+
+
+def is_node_quantized(node: torch.fx.Node) -> bool:
+    if node.target in dq_q_ops:
+        return True
+
+    user_q_args, input_q_args = get_neighbour_quant_args(node)
+
+    # If we did not find any neighbouring quant nodes, we are not quantized.
+    if len(input_q_args) == 0 and len(user_q_args) == 0:
+        return False
+
+    if node.target in passable_ops:
+        assert all_q_args_equal(
+            user_q_args + input_q_args
+        ), f"Node {node} needs same quantization parameters on all inputs and outputs."
+
+    return True
+
+
+def search_quant_arg_downstream(node: torch.fx.Node) -> QuantArgs | None:
+    """
+    Iterates downward in the graph passing through 'passable_ops' to find and return a quantization node,
+    starting with 'node'.
+    If a  passable node with multiple consumers is encountered,
+    find QuantArgs for all consumers and assert that they are equal.
+    If a node not in passable_ops is encountered, return None.
+    If a node without consumers is encountered, return None.
+    """
+    if node.target in dq_q_ops:
+        return qargs_from_qnode(node)
+    if node.target not in passable_ops:
+        return None
+    consumer_nodes = list(node.users)
+    if len(consumer_nodes) == 0:
+        return None
+    elif len(consumer_nodes) == 1:
+        return search_quant_arg_downstream(consumer_nodes[0])
+    else:
+        consumer_qargs: list[QuantArgs] = []
+        for input in consumer_nodes:
+            quant_args = search_quant_arg_downstream(input)
+            if quant_args:
+                consumer_qargs.append(quant_args)
+        if len(consumer_qargs) == 0:
+            return None
+        assert all_q_args_equal(
+            consumer_qargs
+        ), f"Encountered a op, {node}, in passable_ops with different QuantArgs for different consumers."
+        return consumer_qargs[0]
+
+
+def get_quant_arg_downstream(node: torch.fx.Node) -> QuantArgs:
+    """Calls search_quant_arg_downstream and asserts that QuantArgs are found,
+    meaning return value can't be None.
+    """
+    qargs = search_quant_arg_downstream(node)
+    assert qargs, f"Did not find QuantArgs downstream for node {node}"
+    return qargs
+
+
+def search_quant_arg_upstream(node: torch.fx.Node) -> QuantArgs | None:
+    """
+    Iterates upward in the graph passing through 'passable_ops' to find and return a quantization node,
+    starting with 'node'.
+    If a  passable node with multiple inputs is encountered,
+    find QuantArgs for all inputs and assert that they are equal.
+    If a node not in passable_ops is encountered, return None.
+    If a node without inputs is encountered, return None.
+    """
+
+    if node.target in dq_q_ops:
+        return qargs_from_qnode(node)
+    if node.target not in passable_ops:
+        return None
+    input_nodes = list(node.all_input_nodes)
+    if len(input_nodes) == 0:
+        return None
+    elif len(input_nodes) == 1:
+        return search_quant_arg_upstream(input_nodes[0])
+    else:
+        input_qargs: list[QuantArgs] = []
+        for input in input_nodes:
+            quant_args = search_quant_arg_upstream(input)
+            if quant_args:
+                input_qargs.append(quant_args)
+        if len(input_qargs) == 0:
+            return None
+        assert all_q_args_equal(
+            input_qargs
+        ), f"Encountered a op, {node}, in passable_ops with different QuantArgs for different inputs."
+        return input_qargs[0]
+
+
+def get_quant_arg_upstream(node: torch.fx.Node) -> QuantArgs:
+    """Calls search_quant_arg_upstream and asserts that QuantArgs are found,
+    meaning return value can't be None.
+    """
+    qargs = search_quant_arg_upstream(node)
+    assert qargs, f"Did not find QuantArgs upstream for node {node}"
+    return qargs
+
+
+def get_quantized_node_output_dtype(node: torch.fx.Node) -> torch.dtype:
+    if isinstance(node.target, Callable) and "output_qparams" in node.meta.keys():
+        # Check if the node has had it's quantization parameters folded
+        # and retrieve the dtype from the meta dict in that case.
+        assert len(node.meta["output_qparams"]) == 1
+        qargs = cast(QuantArgs, node.meta["output_qparams"][0])
+        return qargs.dtype
+
+    if node.target in dq_q_ops:
+        return cast(torch.dtype, node.args[5])
+
+    # if not a tosa node, nor a q/dq op, walk the graph until we find a q op
+    user_q_args, input_q_args = get_neighbour_quant_args(node)
+    if len(user_q_args) > 0:
+        return user_q_args[0].dtype
+    elif node.target in passable_ops and len(input_q_args) > 0:
+        return input_q_args[0].dtype
+    else:
+        raise RuntimeError("No quantized node found in graph")
+
+
 # Check if scale32 mode is used for given output element type
 def is_scale32(type):
     return type == ts.DType.INT8
@@ -279,6 +476,69 @@ def build_rescale_from_int32(
     return
 
 
+def rescale_nodes_to_int32(
+    nodes: Sequence[Node], tosa_graph: ts.TosaSerializer
+) -> tuple[list[TosaSerializerTensor], float]:
+    """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'.
+    The scales are adjusted using the smallest scale of all 'nodes'.
+
+    Returns a list of the rescaled nodes and the scale factor used,
+    needed by rescale_node_back_to_int8.
+    """
+
+    tensors = [TosaArg(node) for node in nodes]
+
+    # Reshape tensor according to tosa dim order
+    for tensor in tensors:
+        dim_order = tensor.dim_order
+        tensor.shape = [tensor.shape[i] for i in dim_order]
+
+    qargs = [get_quant_arg_upstream(node) for node in nodes]
+
+    # Scale the int8 quantized input to a common scale in the integer
+    # domain
+    min_scale = min([qarg.scale for qarg in qargs])
+    scales = [qarg.scale / min_scale for qarg in qargs]
+
+    rescaled_nodes: list[TosaSerializerTensor] = []
+    for tensor, qarg, scale in zip(tensors, qargs, scales):
+        rescaled_nodes.append(
+            build_rescale_to_int32(
+                tosa_graph,
+                tensor,
+                qarg.zp,
+                scale,
+            )
+        )
+    return rescaled_nodes, min_scale
+
+
+def rescale_node_back_to_int8(
+    node: Node,
+    last_tensor: TosaSerializerTensor,
+    scale: float,
+    tosa_graph: ts.TosaSerializer,
+):
+    """Rescales the node back to int8, adding a suitable RESCALE op to 'tosa_graph'.
+    Parameters:
+        node: The original node that is being handled by the rescales.
+        last_tensor:the tosa tensor to rescale back.
+        scale: the scaling factor used to rescale to int32, from the function 'rescale_nodes_to_int32'
+        tosa_graph: the tosa_graph to manipulate.
+    """
+    qargs_out = get_quant_arg_downstream(list(node.users)[0])
+    output_rescale_scale = scale / qargs_out.scale
+
+    # Rescale Back to INT8
+    build_rescale_from_int32(
+        tosa_graph,
+        last_tensor.name,
+        node.name,
+        qargs_out.zp,
+        output_rescale_scale,
+    )
+
+
 """ Creates a TOSA rescale op based on conv2d parameters. """
 
 
diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py
index 9fefdbb3ff..c03e0ef0bb 100644
--- a/backends/arm/tosa_utils.py
+++ b/backends/arm/tosa_utils.py
@@ -115,6 +115,10 @@ def getNodeArgs(node: Node) -> list[TosaArg]:
     return [TosaArg(arg) for arg in node.args]
 
 
+def get_input_tensor(node: Node) -> TosaArg:
+    return TosaArg(node.args[0])
+
+
 def get_output_node(node: Node) -> Node:
     return list(node.users)[0]
 
@@ -142,6 +146,30 @@ def is_consumer_node_depthwise_conv2d(node):
     return False
 
 
+def get_two_inputs(node: Node, check: bool = False) -> tuple[Node, Node]:
+    """Returns two input nodes to 'node' in order. If 'node' only has one input,
+    it is returned twice.
+
+    Fails if there are no input nodes.
+    Fails if there are >2 input nodes and 'check' is True,
+    """
+
+    num_inputs = len(node.all_input_nodes)
+    assert num_inputs > 0, f"Node '{node.name}' requires >0 input, got {num_inputs}."
+
+    input1 = node.all_input_nodes[0]
+    if num_inputs == 1:
+        input2 = node.all_input_nodes[0]
+    else:
+        input2 = node.all_input_nodes[1]
+    if check:
+        assert (
+            num_inputs <= 2
+        ), f"Node '{node.name}' requires <=2 inputs, got {num_inputs}."
+
+    return input1, input2
+
+
 def tosa_shape(shape, dim_order):
     return tuple([shape[dim] for dim in dim_order])
 
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index a49436193b..9563be93aa 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -264,11 +264,7 @@ def get_compile_spec(
 ) -> list[CompileSpec]:
     spec_builder = None
     if target == "TOSA":
-        spec_builder = (
-            ArmCompileSpecBuilder()
-            .tosa_compile_spec("TOSA-0.80+BI")
-            .set_quantize_io(True)
-        )
+        spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI")
     elif "ethos-u55" in target:
         spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec(
             target,

From 5b9ab56657dabda161e866d4a574172f974b20c8 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 17 Jan 2025 09:16:12 -0800
Subject: [PATCH 35/40] install_requirements.py: reorganize requirements
 (#7705)

Duplicate requirements with the pyproject.toml > /dev/null
One unique devel requirement > requirements-dev.txt
Examples requirements > requirements-examples.txt
Nightlies stayed in the script.

Rationale: be as "normal" a Python project as seemed possible.

Test Plan: install_requirements.sh in a clean venv succeeded
---
 install_requirements.py   | 25 +++++--------------------
 requirements-examples.txt |  5 +++++
 2 files changed, 10 insertions(+), 20 deletions(-)
 create mode 100644 requirements-examples.txt

diff --git a/install_requirements.py b/install_requirements.py
index 409460ca10..52ba89edd7 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -104,34 +104,15 @@ def install_requirements(use_pytorch_nightly):
             if use_pytorch_nightly
             else "torchvision"
         ),  # For testing.
-        "typing-extensions",
     ]
 
-    # pip packages needed to run examples.
-    # TODO: Make each example publish its own requirements.txt
     EXAMPLES_REQUIREMENTS = [
-        "timm==1.0.7",
         f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchaudio",
-        "torchsr==1.0.4",
-        "transformers==4.47.1",
-    ]
-
-    # pip packages needed for development.
-    DEVEL_REQUIREMENTS = [
-        "cmake",  # For building binary targets.
-        "pip>=23",  # For building the pip package.
-        "pyyaml",  # Imported by the kernel codegen tools.
-        "setuptools>=63",  # For building the pip package.
-        "tomli",  # Imported by extract_sources.py when using python < 3.11.
-        "wheel",  # For building the pip package archive.
-        "zstd",  # Imported by resolve_buck.py.
     ]
 
     # Assemble the list of requirements to actually install.
     # TODO: Add options for reducing the number of requirements.
-    REQUIREMENTS_TO_INSTALL = (
-        EXIR_REQUIREMENTS + DEVEL_REQUIREMENTS + EXAMPLES_REQUIREMENTS
-    )
+    REQUIREMENTS_TO_INSTALL = EXIR_REQUIREMENTS + EXAMPLES_REQUIREMENTS
 
     # Install the requirements. `--extra-index-url` tells pip to look for package
     # versions on the provided URL if they aren't available on the default URL.
@@ -141,6 +122,8 @@ def install_requirements(use_pytorch_nightly):
             "-m",
             "pip",
             "install",
+            "-r",
+            "requirements-examples.txt",
             *REQUIREMENTS_TO_INSTALL,
             "--extra-index-url",
             TORCH_NIGHTLY_URL,
@@ -160,6 +143,8 @@ def install_requirements(use_pytorch_nightly):
             "-m",
             "pip",
             "install",
+            # Without --no-build-isolation, setup.py can't find the torch module.
+            "--no-build-isolation",
             *LOCAL_REQUIREMENTS,
         ],
         check=True,
diff --git a/requirements-examples.txt b/requirements-examples.txt
new file mode 100644
index 0000000000..d4126a178a
--- /dev/null
+++ b/requirements-examples.txt
@@ -0,0 +1,5 @@
+# pip packages needed to run examples.
+# TODO: Make each example publish its own requirements.txt
+timm == 1.0.7
+torchsr == 1.0.4
+transformers ==4.47.1

From 5dfbf478958577b96ccd14cdae235ed35cda27b1 Mon Sep 17 00:00:00 2001
From: JP <46308822+zonglinpeng@users.noreply.github.com>
Date: Fri, 17 Jan 2025 10:04:08 -0800
Subject: [PATCH 36/40] fix g3 dequant

Differential Revision: D68109702

Pull Request resolved: https://github.com/pytorch/executorch/pull/7683
---
 .../cadence/fusion_g3/operators/op_dequantize.cpp     | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
index cff50f2a90..3e0235170b 100644
--- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp
+++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp
@@ -67,8 +67,8 @@ void check_dequantize_per_tensor_args(
 
   ET_CHECK_MSG(
       input.scalar_type() == dtype,
-      "input.scalar_type() %" PRId8 " is not matching dtype argumenta:",
-      static_cast<int8_t>(input.scalar_type()));
+      "input.scalar_type() %s is not matching dtype arguments:",
+      ::executorch::runtime::toString(input.scalar_type()));
 
   if (out_dtype.has_value()) {
     ET_CHECK_MSG(
@@ -561,11 +561,12 @@ Tensor& dequantize_per_tensor_out(
     const Tensor& input,
     double scale,
     int64_t zero_point,
-    int64_t quant_min,
-    int64_t quant_max,
+    __ET_UNUSED int64_t quant_min,
+    __ET_UNUSED int64_t quant_max,
     ScalarType dtype,
-    ::executorch::aten::optional<ScalarType> out_dtype,
     Tensor& out) {
+  constexpr ScalarType out_dtype = ScalarType::Float;
+
 #ifdef OP_ARG_CHECK
   torch::executor::Error err = resize_tensor(out, input.sizes());
   ET_CHECK_MSG(

From ce77ee7c4363d6c370c5e52da2b85f67f70943d1 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 17 Jan 2025 10:30:36 -0800
Subject: [PATCH 37/40] Fix linux_job_v2 after
 https://github.com/pytorch/test-infra/pull/6104 (#7731)

---
 .github/workflows/pull.yml  | 3 +++
 .github/workflows/trunk.yml | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 8b32e46cf2..b629a52e72 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -333,6 +333,9 @@ jobs:
 
   unittest-arm:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 90bd0eb6ef..0cbbe6f643 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -132,6 +132,9 @@ jobs:
   test-arm-backend-delegation:
     name: test-arm-backend-delegation
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -159,6 +162,9 @@ jobs:
   test-arm-reference-delegation:
     name: test-arm-reference-delegation
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-arm-sdk

From a8c46d1e43b6d4691efba1b003666af2e8f578f5 Mon Sep 17 00:00:00 2001
From: JP <46308822+zonglinpeng@users.noreply.github.com>
Date: Fri, 17 Jan 2025 10:49:46 -0800
Subject: [PATCH 38/40] migrate facto utils to OSS

Differential Revision: D68195666

Pull Request resolved: https://github.com/pytorch/executorch/pull/7686
---
 examples/cadence/operators/facto_util.py | 91 ++++++++++++++++++++++++
 examples/cadence/operators/targets.bzl   | 14 ++++
 2 files changed, 105 insertions(+)
 create mode 100644 examples/cadence/operators/facto_util.py

diff --git a/examples/cadence/operators/facto_util.py b/examples/cadence/operators/facto_util.py
new file mode 100644
index 0000000000..e9b16f8bf6
--- /dev/null
+++ b/examples/cadence/operators/facto_util.py
@@ -0,0 +1,91 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+import copy
+from typing import List, OrderedDict, Tuple
+
+import torch
+from inputgen.argtuple.gen import ArgumentTupleGenerator
+from inputgen.specs.model import ConstraintProducer as cp
+from inputgen.utils.random_manager import random_manager
+from inputgen.variable.type import ScalarDtype
+from specdb.db import SpecDictDB
+
+# Fixed seed so every run generates identical cases, making failures reproducible when bisecting.
+random_manager.seed(1729)
+
+
+def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> None:  # appends to tensor_constraints in place; returns nothing
+    match op_name:  # choose the allowed tensor dtypes from the operator name
+        case (
+            "sigmoid.default"
+            | "_softmax.default"
+            | "rsqrt.default"
+            | "exp.default"
+            | "mul.Tensor"
+            | "div.Tensor"
+        ):  # these ops are restricted to torch.float tensors
+            tensor_constraints.append(
+                cp.Dtype.In(lambda deps: [torch.float]),
+            )
+        case (
+            "add.Tensor"
+            | "sub.Tensor"
+            | "add.Scalar"
+            | "sub.Scalar"
+            | "mul.Scalar"
+            | "div.Scalar"
+        ):  # these ops allow torch.float and torch.int tensors
+            tensor_constraints.append(
+                cp.Dtype.In(lambda deps: [torch.float, torch.int]),
+            )
+        case _:  # any other op: same float/int constraint as the case above
+            tensor_constraints.append(
+                cp.Dtype.In(lambda deps: [torch.float, torch.int]),
+            )
+    tensor_constraints.extend(  # bounds added for every op, whichever dtype branch ran
+        [
+            cp.Value.Ge(lambda deps, dtype, struct: -(2**8)),  # value lower bound: -256
+            cp.Value.Le(lambda deps, dtype, struct: 2**8),  # value upper bound: 256
+            cp.Rank.Ge(lambda deps: 1),  # rank lower bound: 1
+            cp.Rank.Le(lambda deps: 2**2),  # rank upper bound: 4
+            cp.Size.Ge(lambda deps, r, d: 1),  # size lower bound: 1 (presumably per dimension; confirm)
+            cp.Size.Le(lambda deps, r, d: 2**2),  # size upper bound: 4
+        ]
+    )
+
+
+def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, str]]]:
+    # Build (posargs, inkwargs) test cases for op_name after adding constraints to its spec.
+    spec = SpecDictDB[op_name]  # NOTE(review): mutated in place below; if this object is shared, repeated calls may accumulate constraints - confirm
+
+    for index, in_spec in enumerate(copy.deepcopy(spec.inspec)):  # iterate a deep copy; the real spec is modified by index
+        if in_spec.type.is_scalar():
+            if in_spec.name != "alpha":  # any scalar argument other than "alpha"
+                spec.inspec[index].constraints.extend(
+                    [
+                        cp.Dtype.In(lambda deps: [ScalarDtype.float, ScalarDtype.int]),
+                        cp.Value.Ge(lambda deps, dtype: -(2**8)),  # value lower bound: -256
+                        cp.Value.Le(lambda deps, dtype: 2**2),  # NOTE(review): upper bound is 2**2 here but 2**8 for tensors - confirm intended
+                        cp.Size.Ge(lambda deps, r, d: 1),  # NOTE(review): Size constraints on a scalar - confirm they have an effect
+                        cp.Size.Le(lambda deps, r, d: 2**2),
+                    ]
+                )
+            else:  # "alpha" gets only a value range: greater than 0, at most 2
+                spec.inspec[index].constraints.extend(
+                    [
+                        cp.Value.Gt(lambda deps, dtype: 0),
+                        cp.Value.Le(lambda deps, dtype: 2),
+                    ]
+                )
+        elif in_spec.type.is_tensor():  # inputs that are neither scalar nor tensor are left unchanged
+            tensor_constraints = []
+            # common tensor constraints, collected into a fresh list and then added to the spec
+            apply_tensor_contraints(op_name, tensor_constraints)
+            spec.inspec[index].constraints.extend(tensor_constraints)
+
+    return [  # keep the first two items of each generated triple; the third is discarded
+        (posargs, inkwargs)
+        for posargs, inkwargs, _ in ArgumentTupleGenerator(spec).gen()
+    ]
diff --git a/examples/cadence/operators/targets.bzl b/examples/cadence/operators/targets.bzl
index e1fbeb9fdf..a646f0076b 100644
--- a/examples/cadence/operators/targets.bzl
+++ b/examples/cadence/operators/targets.bzl
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest")
+load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 
 TESTS_LIST = [
     "add_op",
@@ -16,6 +17,19 @@ def define_common_targets():
     for op in TESTS_LIST:
         _define_test_target(op)
 
+    python_library(
+        name = "facto_util",
+        srcs = [
+            "facto_util.py",
+        ],
+        typing = True,
+        deps = [
+            "fbcode//caffe2:torch",
+            "fbcode//pytorch/facto:inputgen",
+            "fbcode//pytorch/facto:specdb",
+        ],
+    )
+
 
 def _define_test_target(test_name):
     file_name = "test_{}".format(test_name)

From 04f764e2e21084fd271e3439ab2a609a00b6faf5 Mon Sep 17 00:00:00 2001
From: JP <46308822+zonglinpeng@users.noreply.github.com>
Date: Fri, 17 Jan 2025 10:56:28 -0800
Subject: [PATCH 39/40] fix typo in cadence cp quantized_conv_out

Differential Revision: D68278032

Pull Request resolved: https://github.com/pytorch/executorch/pull/7706
---
 backends/cadence/reference/operators/quantized_conv_out.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/cadence/reference/operators/quantized_conv_out.cpp b/backends/cadence/reference/operators/quantized_conv_out.cpp
index 5a7af85809..b18159a0b3 100644
--- a/backends/cadence/reference/operators/quantized_conv_out.cpp
+++ b/backends/cadence/reference/operators/quantized_conv_out.cpp
@@ -119,7 +119,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
                     if (((_h + d0 * _wh - p0) >= 0) &&
                         ((_h + d0 * _wh - p0) < h) &&
                         ((_w + d1 * _ww - p1) >= 0) &&
-                        ((_w + d1 * _ww - p1 < w))) {
+                        ((_w + d1 * _ww - p1) < w)) {
                       int ioff =
                           (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1);
                       int woff = _wh * ww + _ww;

From 8494b9085605229954e869f5aa18e82b20c9ead0 Mon Sep 17 00:00:00 2001
From: pytorchbot <soumith+bot@pytorch.org>
Date: Fri, 17 Jan 2025 13:48:59 -0600
Subject: [PATCH 40/40] [ET-VK][ez] Fix `conv2d_pw` shared memory buffer not
 having a constant size (#7734)

## Context

`conv2d_pw` was failing to compile on Mac due to the shared memory array not having a constant size.

Fix this by hardcoding the workgroup size (64 threads) used to size the shared array.

Differential Revision: [D68331984](https://our.internmc.facebook.com/intern/diff/D68331984/)

ghstack-source-id: 261911463
Pull Request resolved: https://github.com/pytorch/executorch/pull/7729

Co-authored-by: Stephen Jia <ssjia@meta.com>
---
 backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
index a5a2097cd5..f72c487fa7 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -33,7 +33,9 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 // shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
+// 64 is the number of threads in the local wg
+$num_shared = 64 * TILE_SIZE * TILE_SIZE
+shared ivec2 pos_shared[${num_shared}];
 
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an