From 8a5f52b9e1ed082dc21aaf1d6b5f9c2646620add Mon Sep 17 00:00:00 2001 From: Daniil Lyakhov Date: Tue, 14 Jan 2025 15:44:39 -0800 Subject: [PATCH 01/40] Typo in coreml README.md (#7586) * Typo in coreml README.md * int8 -> qint8, uint8 -> quint8 --- backends/apple/coreml/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/apple/coreml/README.md b/backends/apple/coreml/README.md index b3b22ed999..e8a062774d 100644 --- a/backends/apple/coreml/README.md +++ b/backends/apple/coreml/README.md @@ -93,14 +93,14 @@ class Model(torch.nn.Module): source_model = Model() example_inputs = (torch.randn((1, 3, 256, 256)), ) -pre_autograd_aten_dialect = export_for_training(model, example_inputs).module() +pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module() quantization_config = LinearQuantizerConfig.from_dict( { "global_config": { "quantization_scheme": QuantizationScheme.symmetric, - "activation_dtype": torch.uint8, - "weight_dtype": torch.int8, + "activation_dtype": torch.quint8, + "weight_dtype": torch.qint8, "weight_per_channel": True, } } From 7fa4b87adabe203d383ff4a9208ef2f94f47d676 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Tue, 14 Jan 2025 18:49:50 -0600 Subject: [PATCH 02/40] [executorch][flat_tensor] Serialize flat tensor (#7641) Pull Request resolved: https://github.com/pytorch/executorch/pull/7268 Serialize a flat tensor file. 
The resulting file looks like: Header containing: - flatbuffer offset and size - segment data offset and size Flatbuffer containing: - Items described in [flat_tensor.fbs](https://www.internalfb.com/code/fbsource/[079ba95593be856a16783bd3f3b3579580595fbb]/fbcode/executorch/extension/flat_tensor/flat_tensor.fbs) Tensor data (in segment) - Raw tensor data ghstack-source-id: 261273078 @exported-using-ghexport Differential Revision: [D66374253](https://our.internmc.facebook.com/intern/diff/D66374253/) Co-authored-by: lucylq --- exir/_serialize/TARGETS | 1 + extension/flat_tensor/__init__.py | 0 extension/flat_tensor/serialize/TARGETS | 18 ++ extension/flat_tensor/serialize/__init__.py | 0 .../serialize/flat_tensor_schema.py | 2 +- extension/flat_tensor/serialize/serialize.py | 290 ++++++++++++++++++ extension/flat_tensor/test/TARGETS | 14 + extension/flat_tensor/test/test_serialize.py | 85 +++++ 8 files changed, 409 insertions(+), 1 deletion(-) create mode 100644 extension/flat_tensor/__init__.py create mode 100644 extension/flat_tensor/serialize/__init__.py create mode 100644 extension/flat_tensor/serialize/serialize.py create mode 100644 extension/flat_tensor/test/TARGETS create mode 100644 extension/flat_tensor/test/test_serialize.py diff --git a/exir/_serialize/TARGETS b/exir/_serialize/TARGETS index 4ce219d950..cd6a4bc5a2 100644 --- a/exir/_serialize/TARGETS +++ b/exir/_serialize/TARGETS @@ -33,6 +33,7 @@ runtime.python_library( "_dataclass.py", "_flatbuffer.py", "_program.py", + "data_serializer.py", "padding.py", ], resources = { diff --git a/extension/flat_tensor/__init__.py b/extension/flat_tensor/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/extension/flat_tensor/serialize/TARGETS b/extension/flat_tensor/serialize/TARGETS index c3acdca054..229f6930f4 100644 --- a/extension/flat_tensor/serialize/TARGETS +++ b/extension/flat_tensor/serialize/TARGETS @@ -14,3 +14,21 @@ runtime.python_library( "//executorch/...", ], ) + 
+runtime.python_library( + name = "serialize", + srcs = [ + "serialize.py", + ], + resources = [ + "flat_tensor.fbs", + "scalar_type.fbs", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + ":schema", + "//executorch/exir/_serialize:lib", + ], +) diff --git a/extension/flat_tensor/serialize/__init__.py b/extension/flat_tensor/serialize/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/extension/flat_tensor/serialize/flat_tensor_schema.py b/extension/flat_tensor/serialize/flat_tensor_schema.py index 091ce1178b..818963d05b 100644 --- a/extension/flat_tensor/serialize/flat_tensor_schema.py +++ b/extension/flat_tensor/serialize/flat_tensor_schema.py @@ -18,7 +18,7 @@ class TensorMetadata: fully_qualified_name: str scalar_type: ScalarType sizes: List[int] - dim_order: List[bytes] + dim_order: List[int] segment_index: int offset: int diff --git a/extension/flat_tensor/serialize/serialize.py b/extension/flat_tensor/serialize/serialize.py new file mode 100644 index 0000000000..9e3df6aafc --- /dev/null +++ b/extension/flat_tensor/serialize/serialize.py @@ -0,0 +1,290 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import json +import os +import tempfile +from dataclasses import dataclass +from typing import ClassVar, Dict, List, Literal, Optional + +import pkg_resources +from executorch.exir._serialize._cord import Cord +from executorch.exir._serialize._dataclass import _DataclassEncoder + +from executorch.exir._serialize._flatbuffer import _flatc_compile +from executorch.exir._serialize.data_serializer import DataPayload, DataSerializer + +from executorch.exir._serialize.padding import aligned_size, pad_to, padding_required + +# Byte order of numbers written to flat tensor headers. 
Always little-endian +# regardless of the host system, since all commonly-used modern CPUs are little +# endian. +_HEADER_BYTEORDER: Literal["little"] = "little" + +from executorch.extension.flat_tensor.serialize.flat_tensor_schema import ( + DataSegment, + FlatTensor, + TensorMetadata, +) + + +def _convert_to_flatbuffer(flat_tensor: FlatTensor) -> Cord: + """Converts a FlatTensor to a flatbuffer and returns the serialized data.""" + flat_tensor_json = json.dumps(flat_tensor, cls=_DataclassEncoder) + with tempfile.TemporaryDirectory() as d: + schema_path = os.path.join(d, "flat_tensor.fbs") + with open(schema_path, "wb") as schema_file: + schema_file.write( + pkg_resources.resource_string(__name__, "flat_tensor.fbs") + ) + scalar_type_path = os.path.join(d, "scalar_type.fbs") + with open(scalar_type_path, "wb") as scalar_type_file: + scalar_type_file.write( + pkg_resources.resource_string(__name__, "scalar_type.fbs") + ) + json_path = os.path.join(d, "flat_tensor.json") + with open(json_path, "wb") as json_file: + json_file.write(flat_tensor_json.encode("ascii")) + + _flatc_compile(d, schema_path, json_path) + output_path = os.path.join(d, "flat_tensor.ptd") + with open(output_path, "rb") as output_file: + return Cord(output_file.read()) + + +@dataclass +class FlatTensorConfig: + tensor_alignment: int = 16 + segment_alignment: int = 16 + + +@dataclass +class FlatTensorHeader: + # Class constants. + # The magic bytes that should be at the beginning of the header. + EXPECTED_MAGIC: ClassVar[bytes] = b"FH01" + EXPECTED_LENGTH: ClassVar[int] = ( + # Header magic + 4 + # Header length + + 4 + # Flatbuffer offset + + 8 + # Flatbuffer data size + + 8 + # Segment base offset + + 8 + # Data size + + 8 + ) + + # Instance attributes. @dataclass will turn these into ctor args. + + # Offset to the start of the flatbuffer data, in bytes. + flatbuffer_offset: int + # The size of the serialized data in bytes. 
+ flatbuffer_size: int + # Offset to the start of the first segment, or zero if there + # are no segments. + segment_base_offset: int + # Size of all the segment data, in bytes. + segment_data_size: int + + # The magic bytes read from or to be written to the binary header. + magic: bytes = EXPECTED_MAGIC + # The header length, in bytes, read from or to be written to the binary + # header. + length: int = EXPECTED_LENGTH + + @staticmethod + def from_bytes(data: bytes) -> "FlatTensorHeader": + """Tries to read a flat_tensor header from the provided data. + + Does not validate that the header is well-formed. Callers should + use is_valid(). + + Args: + data: The data to read from. + Returns: + The contents of the flat_tensor header. + Raises: + ValueError: If not enough data is provided. + """ + if len(data) < FlatTensorHeader.EXPECTED_LENGTH: + raise ValueError( + f"Not enough data for flat_tensor header: {len(data)} " + + f"< {FlatTensorHeader.EXPECTED_LENGTH}" + ) + + return FlatTensorHeader( + magic=data[0:4], + length=int.from_bytes(data[4:8], byteorder=_HEADER_BYTEORDER), + flatbuffer_offset=int.from_bytes(data[8:16], byteorder=_HEADER_BYTEORDER), + flatbuffer_size=int.from_bytes(data[16:24], byteorder=_HEADER_BYTEORDER), + segment_base_offset=int.from_bytes( + data[24:32], byteorder=_HEADER_BYTEORDER + ), + segment_data_size=int.from_bytes(data[32:40], byteorder=_HEADER_BYTEORDER), + ) + + def is_valid(self) -> bool: + """Returns true if the flat_tensor header appears to be well-formed.""" + return ( + self.magic == FlatTensorHeader.EXPECTED_MAGIC + and self.length >= FlatTensorHeader.EXPECTED_LENGTH + ) + + def to_bytes(self) -> bytes: + """Returns the binary representation of the flat_tensor header. + + Note that this will ignore self.magic and self.length and will always + write the proper magic/length. + """ + data: bytes = ( + # Extended header magic. This lets consumers detect whether the + # header was inserted or not. 
Always use the proper magic value + # (i.e., ignore self.magic) since there's no reason to create an + # invalid header. + self.EXPECTED_MAGIC + # uint32_t: Size of this header. This makes it easier to add new + # fields to this header in the future. Always use the proper size + # (i.e., ignore self.length) since there's no reason to create an + # invalid header. + + self.EXPECTED_LENGTH.to_bytes(4, byteorder=_HEADER_BYTEORDER) + # uint64_t: Offset to the start of the flatbuffer data, in bytes. + + self.flatbuffer_offset.to_bytes(8, byteorder=_HEADER_BYTEORDER) + # uint64_t: Size of the serialized data in bytes. + + self.flatbuffer_size.to_bytes(8, byteorder=_HEADER_BYTEORDER) + # uint64_t: Offset to the start of the first segment, or zero if + # there are no segments. + + self.segment_base_offset.to_bytes(8, byteorder=_HEADER_BYTEORDER) + # uint64_t: Size of all the segment data, in bytes. + + self.segment_data_size.to_bytes(8, byteorder=_HEADER_BYTEORDER) + ) + return data + + +class FlatTensorSerializer(DataSerializer): + """A concrete implementation of the DataSerializer interface that + serializes and deserializes data to/from the FlatTensor format. + """ + + def __init__(self, config: Optional[FlatTensorConfig] = None) -> None: + """FlatTensorConfig holds information required for serialization, + eg. alignment. + """ + if config is None: + self.config: FlatTensorConfig = FlatTensorConfig() + else: + self.config: FlatTensorConfig = config + + def serialize( + self, + data: DataPayload, + ) -> Cord: + """Serializes a list of tensor metadata and tensors into a blob.""" + + flat_tensor_metadata: List[TensorMetadata] = [] + flat_tensor_data: Cord = Cord() + + # {idx, offset} + saved_offsets: Dict[int, int] = {} + + for fqn, tensor_entry in data.fqn_to_tensor.items(): + assert tensor_entry.layout is not None + # Check index into the tensor buffers is valid. 
+ assert tensor_entry.buffer_index < len( + data.buffers + ), f"Invalid index {tensor_entry.buffer_index} is greater than tensor buffer size {len(data.buffers)}." + + # Check if the tensor has already been appended to the flat_tensor_data. + offset = saved_offsets.get(tensor_entry.buffer_index, -1) + if offset == -1: + if len(flat_tensor_data) > 0: + # Add padding to round off the previous tensor offset. + pad_length = padding_required( + len(flat_tensor_data), self.config.tensor_alignment + ) + flat_tensor_data.append(b"\x00" * pad_length) + # Add to saved offsets. + offset = len(flat_tensor_data) + saved_offsets[tensor_entry.buffer_index] = offset + # Append to flat_tensor_data at the offset. + flat_tensor_data.append(data.buffers[tensor_entry.buffer_index]) + + flat_tensor_metadata.append( + TensorMetadata( + fully_qualified_name=fqn, + scalar_type=tensor_entry.layout.scalar_type, + sizes=tensor_entry.layout.sizes, + dim_order=tensor_entry.layout.dim_order, + segment_index=0, + offset=offset, + ) + ) + + # Pad flat_tensor_data to segment alignment. + segment_pad_length = padding_required( + len(flat_tensor_data), self.config.segment_alignment + ) + if segment_pad_length > 0: + flat_tensor_data.append(b"\x00" * segment_pad_length) + + # Create FlatTensor, which describes the contents of the file and + # points to all the data segments. It will be serialized to flatbuffer. 
+ flat_tensor = FlatTensor( + version=0, + tensor_alignment=self.config.tensor_alignment, + tensors=flat_tensor_metadata, + segments=[DataSegment(offset=0, size=len(flat_tensor_data))], + ) + + flatbuffer_payload = _convert_to_flatbuffer(flat_tensor) + padded_flatbuffer_length: int = aligned_size( + input_size=len(flatbuffer_payload), + alignment=self.config.tensor_alignment, + ) + + padded_header_length: int = aligned_size( + input_size=FlatTensorHeader.EXPECTED_LENGTH, + alignment=self.config.tensor_alignment, + ) + + segment_base_offset = aligned_size( + padded_flatbuffer_length + padded_header_length, + self.config.segment_alignment, + ) + + # Create FlatTensorHeader, which stores the offsets and sizes of the + # FlatTensor flatbuffer and the segment data. + header_data: bytes = FlatTensorHeader( + flatbuffer_offset=padded_header_length, + flatbuffer_size=len(flatbuffer_payload), + segment_base_offset=segment_base_offset, + segment_data_size=len(flat_tensor_data), + ).to_bytes() + + # Pad header and payload to segment alignment. + header_data = pad_to(header_data, padded_header_length) + flatbuffer_payload.append( + b"\x00" * (padded_flatbuffer_length - len(flatbuffer_payload)) + ) + + # Place everything into one segment. + payload = Cord() + payload.append(header_data) + payload.append(flatbuffer_payload) + payload.append(flat_tensor_data) + + return payload + + def deserialize(self, blob: Cord) -> DataPayload: + """ + Deserializes a flat_tensor blob into a list of tensor metadata and tensors. 
+ """ + raise NotImplementedError("deserialize_data") diff --git a/extension/flat_tensor/test/TARGETS b/extension/flat_tensor/test/TARGETS new file mode 100644 index 0000000000..6f708ae848 --- /dev/null +++ b/extension/flat_tensor/test/TARGETS @@ -0,0 +1,14 @@ +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("executorch") + +python_unittest( + name = "serialize", + srcs = [ + "test_serialize.py", + ], + deps = [ + "//executorch/extension/flat_tensor/serialize:serialize", + "//executorch/extension/flat_tensor/serialize:schema", + ], +) diff --git a/extension/flat_tensor/test/test_serialize.py b/extension/flat_tensor/test/test_serialize.py new file mode 100644 index 0000000000..d023567274 --- /dev/null +++ b/extension/flat_tensor/test/test_serialize.py @@ -0,0 +1,85 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import unittest + +from executorch.exir._serialize.data_serializer import ( + DataPayload, + DataSerializer, + TensorEntry, + TensorLayout, +) + +from executorch.exir._serialize.padding import aligned_size + +from executorch.exir.schema import ScalarType + +from executorch.extension.flat_tensor.serialize.serialize import ( + FlatTensorConfig, + FlatTensorHeader, + FlatTensorSerializer, +) + +# Test artifacts. 
+TEST_TENSOR_BUFFER = [b"tensor"] +TEST_TENSOR_MAP = { + "fqn1": TensorEntry( + buffer_index=0, + layout=TensorLayout( + scalar_type=ScalarType.FLOAT, + sizes=[1, 1, 1], + dim_order=[0, 1, 2], + ), + ), + "fqn2": TensorEntry( + buffer_index=0, + layout=TensorLayout( + scalar_type=ScalarType.FLOAT, + sizes=[1, 1, 1], + dim_order=[0, 1, 2], + ), + ), +} +TEST_DATA_PAYLOAD = DataPayload( + buffers=TEST_TENSOR_BUFFER, + fqn_to_tensor=TEST_TENSOR_MAP, +) + + +class TestSerialize(unittest.TestCase): + def test_serialize(self) -> None: + config = FlatTensorConfig() + serializer: DataSerializer = FlatTensorSerializer(config) + + data = bytes(serializer.serialize(TEST_DATA_PAYLOAD)) + + header = FlatTensorHeader.from_bytes(data[0 : FlatTensorHeader.EXPECTED_LENGTH]) + self.assertTrue(header.is_valid()) + + # Header is aligned to config.segment_alignment, which is where the flatbuffer starts. + self.assertEqual( + header.flatbuffer_offset, + aligned_size(FlatTensorHeader.EXPECTED_LENGTH, config.segment_alignment), + ) + + # Flatbuffer is non-empty. + self.assertTrue(header.flatbuffer_size > 0) + + # Segment base offset is aligned to config.segment_alignment. + expected_segment_base_offset = aligned_size( + header.flatbuffer_offset + header.flatbuffer_size, config.segment_alignment + ) + self.assertTrue(header.segment_base_offset, expected_segment_base_offset) + + # TEST_TENSOR_BUFFER is aligned to config.segment_alignment. + self.assertEqual(header.segment_data_size, config.segment_alignment) + + # Confirm the flatbuffer magic is present. 
+ self.assertEqual( + data[header.flatbuffer_offset + 4 : header.flatbuffer_offset + 8], b"FT01" + ) From 0b81bb693d9b43cd84a6728aa3b7ab376139ad03 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:55:50 -0800 Subject: [PATCH 03/40] Additional QNN version fix (#7664) Co-authored-by: Guang Yang --- .github/workflows/android-perf.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 1874a4fd6e..f2a289e230 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -260,7 +260,7 @@ jobs: --output_name="${OUT_ET_MODEL_NAME}.pte" ls -lh "${OUT_ET_MODEL_NAME}.pte" elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then - export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 + export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/ export PYTHONPATH=$(pwd)/.. @@ -347,7 +347,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh export ANDROID_ABIS="arm64-v8a" - PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} + PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME} # Let's see how expensive this job is, we might want to tone it down by running it periodically benchmark-on-device: From d9992040744bb1a5da4cffb1727086b027766e5e Mon Sep 17 00:00:00 2001 From: lucylq Date: Tue, 14 Jan 2025 19:05:33 -0800 Subject: [PATCH 04/40] fix-up (#7665) --- .../runtime/graph/ops/glsl/conv2d_dw.glsl | 2 +- .../graph/ops/glsl/conv2d_dw_output_tile.glsl | 2 +- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 2 +- .../graph/ops/glsl/indexing_utils_u16.h | 19 ------------------- 4 files changed, 3 insertions(+), 22 deletions(-) delete mode 100644 
backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl index 23cbb1b652..103f3cfdd7 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl @@ -14,7 +14,7 @@ #define op(X, A, B) ${OPERATOR} -#include "indexing_utils_u16.h" +#include "indexing_utils.h" layout(std430) buffer; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index 48afd3a9a7..9e69fdd1fe 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -22,7 +22,7 @@ #define op(X, A, B) ${OPERATOR} -#include "indexing_utils_u16.h" +#include "indexing_utils.h" layout(std430) buffer; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index b50a892cad..a5a2097cd5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -16,7 +16,7 @@ #define op(X, A, B) ${OPERATOR} -#include "indexing_utils_u16.h" +#include "indexing_utils.h" layout(std430) buffer; diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h deleted file mode 100644 index 6dc59b6303..0000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils_u16.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#ifndef INDEXING_UTILS_U16_H -#define INDEXING_UTILS_U16_H - -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require - -u16vec3 idx_to_u16pos_x_wise(uint idx, int size_x, int size_y) { - const uint div_by_x = idx / size_x; - return u16vec3(idx % size_x, div_by_x % size_y, div_by_x / size_y); -} - -#endif // INDEXING_UTILS_U16_H From 24f0d34dcddf7e050b601b55e3e310f72568399a Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Tue, 14 Jan 2025 21:11:53 -0800 Subject: [PATCH 05/40] Log dtype names on input dtype mismatch (#7537) Log dtype names on input dtype mismatch (#7537) Summary: Update the error message when input tensor scalar type is incorrect. We've seen this get hit a few times and it should be easier to debug than it is. New Message: ``` [method.cpp:834] Input 0 has unexpected scalar type: expected Float but was Byte. ``` Old Message: ``` [method.cpp:826] The 0-th input tensor's scalartype does not meet requirement: found 0 but expected 6 ``` Test Plan: Built executorch bento kernel locally and tested with an incorrect scalar type to view the new error message. ``` [method.cpp:834] Input 0 has unexpected scalar type: expected Float but was Byte. ``` I also locally patched and built the bento kernel with ET_ENABLE_ENUM_STRINGS=0. ``` [method.cpp:834] Input 0 has unexpected scalar type: expected 6 but was 0. 
``` Differential Revision: D67887770 Pulled By: GregoryComer --- runtime/executor/method.cpp | 8 ++++---- runtime/executor/targets.bzl | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index c9563d3ae5..c539613e6b 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -840,14 +840,14 @@ Method::set_input(const EValue& input_evalue, size_t input_idx) { if (e.isTensor()) { const auto& t_dst = e.toTensor(); const auto& t_src = input_evalue.toTensor(); + ET_CHECK_OR_RETURN_ERROR( t_dst.scalar_type() == t_src.scalar_type(), InvalidArgument, - "The %zu-th input tensor's scalartype does not meet requirement: found %" PRId8 - " but expected %" PRId8, + "Input %zu has unexpected scalar type: expected %s but was %s.", input_idx, - static_cast(t_src.scalar_type()), - static_cast(t_dst.scalar_type())); + executorch::runtime::toString(t_dst.scalar_type()), + executorch::runtime::toString(t_src.scalar_type())); // Reset the shape for the Method's input as the size of forwarded input // tensor for shape dynamism. Also is a safety check if need memcpy. 
Error err = resize_tensor(t_dst, t_src.sizes()); diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index cc91255d7b..158da5d108 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -82,6 +82,7 @@ def define_common_targets(): "//executorch/runtime/core:evalue" + aten_suffix, "//executorch/runtime/core:event_tracer" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, + "//executorch/runtime/core/exec_aten/util:scalar_type_util" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, "//executorch/runtime/kernel:kernel_runtime_context" + aten_suffix, "//executorch/runtime/kernel:operator_registry", From 3ef100dd7c0099ad9c81e0d75571f065d7d46adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Per=20=C3=85strand?= Date: Wed, 15 Jan 2025 09:34:06 +0100 Subject: [PATCH 06/40] Use ArmQuantizer to quantize bias (#7649) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the 'manual' quantization of bias parameter and let the quantizer handle the quantization instead. Signed-off-by: Per Åstrand --- backends/arm/process_node.py | 66 +++---------------- .../arm/quantizer/quantization_annotator.py | 4 +- backends/arm/quantizer/quantization_config.py | 40 ++++++++++- backends/arm/test/misc/test_debug_feats.py | 4 +- backends/arm/tosa_utils.py | 15 +---- 5 files changed, 51 insertions(+), 78 deletions(-) diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py index 6aa663b81e..9ab9c49044 100644 --- a/backends/arm/process_node.py +++ b/backends/arm/process_node.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -11,11 +11,6 @@ import serializer.tosa_serializer as ts import torch import torch.fx - -# pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.' -from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( - get_input_qparams, -) from executorch.backends.arm.operators.node_visitor import NodeVisitor from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg from executorch.backends.arm.tosa_quant_utils import ( @@ -24,11 +19,7 @@ is_node_quantized, ) from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.backends.arm.tosa_utils import ( - getNodeArgs, - is_bias_node_for_quantized_conv, - tosa_shape, -) +from executorch.backends.arm.tosa_utils import getNodeArgs, tosa_shape from torch.export.exported_program import ExportedProgram @@ -99,41 +90,6 @@ def process_inputs( tosa_graph.addInputTensor(tensor) -def process_quantized_bias( - node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, - parameter_values, -): - """ - Serialize bias node that needs to be quantized. - """ - consumer_node = list(node.users)[0] - ( - input_node, - weight_node, - _, - ) = consumer_node.all_input_nodes - - input_qargs = get_input_qparams( # pyre-ignore[16]: Module `executorch.backends.arm` has no attribute `_passes`. 
- consumer_node - ) - - input_node_scale = input_qargs[0].scale - weight_node_scale = input_qargs[1].scale - bias_values_quantized = ( - (parameter_values / (input_node_scale * weight_node_scale)) - .round() - .astype(np.int32) - ) - - tosa_graph.addConst( - bias_values_quantized.shape, - ts.DType.INT32, - bias_values_quantized, - name=node.name, - ) - - def process_inputs_to_parameters( node: torch.fx.Node, tosa_graph: ts.TosaSerializer, @@ -148,20 +104,14 @@ def process_inputs_to_parameters( assert isinstance(parameter_data, torch.Tensor), "Expect Attr to be tensor" parameter_values = parameter_data.detach().numpy() - if is_bias_node_for_quantized_conv(node): - # BI bias - assert tosa_spec.support_integer(), f"{tosa_spec} doesnt't support integer" - process_quantized_bias(node, tosa_graph, parameter_values) - else: - # MI weights or bias - if inputs[0].dtype == torch.float32: - assert tosa_spec.support_float(), f"{tosa_spec} doesn't support float" + if inputs[0].dtype == torch.float32: + assert tosa_spec.support_float(), f"{tosa_spec} doesn't support float" - parameter_values = np.transpose(parameter_values, inputs[0].dim_order) + parameter_values = np.transpose(parameter_values, inputs[0].dim_order) - tosa_graph.addConst( - parameter_values.shape, inputs[0].dtype, parameter_values, name=node.name - ) + tosa_graph.addConst( + parameter_values.shape, inputs[0].dtype, parameter_values, name=node.name + ) def process_inputs_to_buffers( diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index 9ddeb61c30..9c4187d32a 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -196,7 +196,7 @@ def get_quant_properties( # noqa: C901 input_act_qspec = quantization_config.get_input_act_qspec() weight_qspec = quantization_config.get_weight_qspec() output_act_qspec = quantization_config.get_output_act_qspec() - bias_qspec = quantization_config.get_bias_qspec() + bias_qspec = quantization_config.get_bias_qspec(node) quant_properties = _OpQuantProperties() diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py index 1e776d37a6..b94d9bda64 100644 --- a/backends/arm/quantizer/quantization_config.py +++ b/backends/arm/quantizer/quantization_config.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -9,8 +9,10 @@ from dataclasses import dataclass import torch +from torch.ao.quantization import ObserverOrFakeQuantize from torch.ao.quantization.quantizer import ( + DerivedQuantizationSpec, FixedQParamsQuantizationSpec, QuantizationSpec, ) @@ -53,8 +55,42 @@ def get_weight_qspec(self) -> QuantizationSpec | None: ], f"Unsupported quantization_spec {self.weight} for weight" return self.weight - def get_bias_qspec(self) -> QuantizationSpec | None: + def get_bias_qspec(self, node: torch.fx.Node) -> QuantizationSpec | None: """Returns QuantizationSpec 'bias' after asserting that bias.dtype is torch.float.""" + + def _derive_qparams_fn( + obs_or_fqs: list[ObserverOrFakeQuantize], + ) -> tuple[torch.Tensor, torch.Tensor]: + assert ( + len(obs_or_fqs) == 2 + ), "Expecting two obs/fqs, one for activation and one for weight, got: {}".format( + len(obs_or_fqs) + ) + act_obs_or_fq = obs_or_fqs[0] + weight_obs_or_fq = obs_or_fqs[1] + act_scale, act_zp = act_obs_or_fq.calculate_qparams() + weight_scale, weight_zp = 
weight_obs_or_fq.calculate_qparams() + return torch.tensor([act_scale * weight_scale]).to( + torch.float32 + ), torch.tensor([0]).to(torch.int32) + + if node.target in [ + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + torch.ops.aten.linear.default, + ]: + input_act = node.args[0] + weight = node.args[1] + quantization_spec = DerivedQuantizationSpec( + derived_from=[(input_act, node), (weight, node)], + derive_qparams_fn=_derive_qparams_fn, + dtype=torch.int32, + quant_min=torch.iinfo(torch.int32).min, + quant_max=torch.iinfo(torch.int32).max - 1, + qscheme=torch.per_tensor_symmetric, + ) + return quantization_spec + if self.bias is None: return None assert ( diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index b2fc271ade..a9491418a4 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -197,10 +197,10 @@ def test_collate_tosa_BI_tests(self): "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests" ) assert os.path.exists( - "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/output_tag5.tosa" + "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/output_tag6.tosa" ) assert os.path.exists( - "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/desc_tag5.json" + "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/desc_tag6.json" ) os.environ.pop("TOSA_TESTCASES_BASE_PATH") diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index 5bda9bbf18..c03e0ef0bb 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024 Arm Limited and/or its affiliates. +# Copyright 2023-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -133,19 +133,6 @@ def build_reshape(tosa_fb, input_name, new_shape, output_name): tosa_fb.addOperator(TosaOp.Op().RESHAPE, [input_name], [output_name], attr) -def is_bias_node_for_quantized_conv(node): - consumer_node = list(node.users)[0] - - if ( - consumer_node.target == exir_ops.edge.aten.convolution.default - and consumer_node.args[2] == node - and consumer_node.meta["val"].dtype == torch.int8 - ): - return True - - return False - - def is_consumer_node_depthwise_conv2d(node): consumer_node = list(node.users)[0] if consumer_node.target == exir_ops.edge.aten.convolution.default: From 85d274a3d20b6aae43372f2bec40c5f43535775c Mon Sep 17 00:00:00 2001 From: Sebastian Larsson <38941629+Sebastian-Larsson@users.noreply.github.com> Date: Wed, 15 Jan 2025 10:00:44 +0100 Subject: [PATCH 07/40] Arm backend: Size adjust conv2d pass improvements (#7646) [Arm backend] Improve the documentation of the size adjust conv2d pass and remove duplicated code. Also add more tests to conv1d and conv2d that need to go through the pass. --- .../arm/_passes/size_adjust_conv2d_pass.py | 103 +++++++++-------- backends/arm/test/ops/test_conv1d.py | 45 ++++++++ backends/arm/test/ops/test_conv2d.py | 105 +++++++++++++++++- 3 files changed, 200 insertions(+), 53 deletions(-) diff --git a/backends/arm/_passes/size_adjust_conv2d_pass.py b/backends/arm/_passes/size_adjust_conv2d_pass.py index 08da9a74c9..ee81127343 100644 --- a/backends/arm/_passes/size_adjust_conv2d_pass.py +++ b/backends/arm/_passes/size_adjust_conv2d_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -6,68 +6,69 @@ # pyre-unsafe -from typing import cast, Optional +from typing import cast import torch.fx +from executorch.backends.arm._passes.arm_pass_utils import create_node from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torch._ops import OpOverload def conv_remainder(input_length, pad, dilation, weight, stride): """ - Returns the size + Returns the remainder of input_length; given the padding, dilation, stride, + and kernel size. """ return (input_length + 2 * pad - dilation * (weight - 1) - 1) % stride -def insert_q_dq_pair( - graph: torch.fx.Graph, - anchor: torch.fx.Node, - q_params: tuple, -): - with graph.inserting_after(anchor): - q = create_node( - graph=graph, - op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, - args=(), # We add the argument last - ) - q.meta = anchor.meta - - with graph.inserting_after(q): - dq = create_node( - graph=graph, - op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, - args=(q,) + q_params, - ) - dq.meta = q.meta - - anchor.replace_all_uses_with(dq) - # We add this last so the replace all uses above does not replace the quantized - # node's first use - q.args = (anchor,) + q_params - return dq - - -def create_node( - graph: torch.fx.Graph, - op_target: OpOverload, - args: tuple = (), - kwargs: Optional[dict] = None, -): - return graph.create_node( - "call_function", - op_target, - args=args, - kwargs=kwargs or {}, - ) - - class SizeAdjustConv2DPass(ExportPass): """ - Adjust the convolution input size to match perfectly with the - weight size, padding, stride and dilation parameters. - This is done by inserting a slice op to remove the uneven end of the input. + Adjust the convolution input size to match the kernel size, padding, stride, + and dilation parameters. 
PyTorch allows the input and kernel shape to not + "match", in which case the remaining rows/columns are truncated. However, + matching the size is a requirement in the TOSA specification. In case the + input and kernel shape do not match, the following is done to meet the + specification: + + 1) The padding is truncated (done in the node visitor) + 2) (if necessary) The input is truncated (done in this pass). + + A simple example would be a 2x2 kernel (no padding, stride=2) and a 5x5 + input: + + ┌───┬───┬───┬───┬───┐ ┌───┬───┬───┬───┬───┐ ┌───┬───┬───┬───┬───┐ + │ X │ X │ │ │ │ │ │ │ X │ X │ │ │ │ │ │ │ - │ + ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ + │ X │ X │ │ │ │ │ │ │ X │ X │ │ │ │ │ │ │ - │ + ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ + │ │ │ │ │ │ -> │ │ │ │ │ │ -> │ X │ X │ │ │ │ -> + ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ + │ │ │ │ │ │ │ │ │ │ │ │ │ X │ X │ │ │ │ + ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ + │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ + └───┴───┴───┴───┴───┘ └───┴───┴───┴───┴───┘ └───┴───┴───┴───┴───┘ + First pass second pass third pass + + ┌───┬───┬───┬───┬───┐ ┌───┬───┬───┬───┬───┐ + │ │ │ │ │ │ │ │ │ │ │ - │ + ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ + │ │ │ │ │ │ │ │ │ │ │ - │ + ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ + │ │ │ X │ X │ │ -> │ │ │ │ │ - │ + ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ + │ │ │ X │ X │ │ │ │ │ │ │ - │ + ├───┼───┼───┼───┼───┤ ├───┼───┼───┼───┼───┤ + │ │ │ │ │ │ │ - │ - │ - │ - │ - │ + └───┴───┴───┴───┴───┘ └───┴───┴───┴───┴───┘ + Fourth pass Unvisited cells + + Cells that are never visited are marked with `-` and are never considered + when the kernel traverses over the input, hence they can be removed. + + To match the shape of the kernel (and all parameters) with the input, a + slice op is inserted to remove the remaining edges (rows and columns) of the + input.
""" conv2d_op = exir_ops.edge.aten.convolution.default @@ -109,9 +110,7 @@ def call(self, graph_module: torch.fx.GraphModule): with graph_module.graph.inserting_before(node): last_node = cast(torch.fx.Node, input_node) for args in slice_args: - slice_node = graph.create_node( - "call_function", self.slice_op, (last_node,) + args - ) + slice_node = create_node(graph, self.slice_op, (last_node,) + args) last_node = slice_node conv_node.replace_input_with(cast(torch.fx.Node, input_node), last_node) modified_graph = True diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index b754a91f36..3e0dfa6c5c 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -180,6 +180,47 @@ def forward(self, x): batches=1, ) +conv1d_7_1x3x16_st2_pd1_dl2 = Conv1d( + in_channels=3, + out_channels=3, + kernel_size=7, + stride=2, + padding=1, + dilation=2, + length=16, + batches=1, +) +conv1d_7_1x3x15_st1_pd0_dl1 = Conv1d( + in_channels=3, + out_channels=3, + kernel_size=7, + stride=1, + padding=0, + dilation=1, + length=15, + batches=1, +) +conv1d_5_1x3x14_st5_pd0_dl1 = Conv1d( + in_channels=3, + out_channels=3, + kernel_size=5, + stride=5, + padding=0, + dilation=1, + length=14, + batches=1, +) +conv1d_5_1x3x9_st5_pd0_dl1 = Conv1d( + in_channels=3, + out_channels=3, + kernel_size=5, + stride=5, + padding=0, + dilation=1, + length=9, + batches=1, +) + two_conv1d_nobias = Conv1d( nbr_conv=2, length=256, @@ -214,6 +255,10 @@ def forward(self, x): ("2_1x2x14_st2", conv1d_2_1x2x14_st2), ("5_3x2x128_st1", conv1d_5_3x2x128_st1), ("3_1x3x224_st2_pd1", conv1d_3_1x3x224_st2_pd1), + ("7_1x3x16_st2_pd1_dl2_needs_adjust_pass", conv1d_7_1x3x16_st2_pd1_dl2), + ("7_1x3x15_st1_pd0_dl1_needs_adjust_pass", conv1d_7_1x3x15_st1_pd0_dl1), + ("5_1x3x14_st5_pd0_dl1_needs_adjust_pass", conv1d_5_1x3x14_st5_pd0_dl1), + ("5_1x3x9_st5_pd0_dl1_needs_adjust_pass", conv1d_5_1x3x9_st5_pd0_dl1), ("two_conv1d_nobias", two_conv1d_nobias), 
("two_conv1d", two_conv1d), ] diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index bbcb421ce7..b80228c6f2 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -201,6 +201,101 @@ def forward(self, x): batches=1, ) +conv2d_7x7_1x3x16x16_st2_pd1_dl2 = Conv2d( + in_channels=3, + out_channels=3, + kernel_size=(7, 7), + stride=2, + padding=1, + dilation=2, + width=16, + height=16, + batches=1, +) + +conv2d_7x7_1x3x15x15_st1_pd0_dl1 = Conv2d( + in_channels=3, + out_channels=3, + kernel_size=(7, 7), + stride=1, + padding=0, + dilation=1, + width=15, + height=15, + batches=1, +) + +conv2d_5x5_1x3x14x14_st5_pd0_dl1 = Conv2d( + in_channels=3, + out_channels=3, + kernel_size=(5, 5), + stride=5, + padding=0, + dilation=1, + width=14, + height=14, + batches=1, +) + +conv2d_5x5_1x3x9x9_st5_pd0_dl1 = Conv2d( + in_channels=3, + out_channels=3, + kernel_size=(5, 5), + stride=5, + padding=0, + dilation=1, + width=9, + height=9, + batches=1, +) + +conv2d_3x3_1x3x8x9_st3_pd0_dl1 = Conv2d( + in_channels=3, + out_channels=3, + kernel_size=(3, 3), + stride=3, + padding=0, + dilation=1, + width=8, + height=9, + batches=1, +) + +conv2d_3x3_1x3x9x8_st3_pd0_dl1 = Conv2d( + in_channels=3, + out_channels=3, + kernel_size=(3, 3), + stride=3, + padding=0, + dilation=1, + width=8, + height=9, + batches=1, +) + +conv2d_3x4_1x3x7x7_st3_pd0_dl1 = Conv2d( + in_channels=3, + out_channels=3, + kernel_size=(3, 4), + stride=3, + padding=0, + dilation=1, + width=7, + height=7, + batches=1, +) + +conv2d_4x3_1x3x7x7_st3_pd0_dl1 = Conv2d( + in_channels=3, + out_channels=3, + kernel_size=(4, 3), + stride=3, + padding=0, + dilation=1, + width=7, + height=7, + batches=1, +) two_conv2d_nobias = Conv2d( nbr_conv=2, @@ -236,7 +331,15 @@ def forward(self, x): ("3x3_1x3x12x12_st2_pd1", conv2d_3x3_1x3x12x12_st2_pd1), ("1x1_1x2x128x128_st1", conv2d_1x1_1x2x128x128_st1), ("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2), 
- ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1), + ("5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1), + ("7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass", conv2d_7x7_1x3x16x16_st2_pd1_dl2), + ("7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass", conv2d_7x7_1x3x15x15_st1_pd0_dl1), + ("5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass", conv2d_5x5_1x3x14x14_st5_pd0_dl1), + ("5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass", conv2d_5x5_1x3x9x9_st5_pd0_dl1), + ("3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass", conv2d_3x3_1x3x9x8_st3_pd0_dl1), + ("3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass", conv2d_3x3_1x3x8x9_st3_pd0_dl1), + ("3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass", conv2d_3x4_1x3x7x7_st3_pd0_dl1), + ("4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass", conv2d_4x3_1x3x7x7_st3_pd0_dl1), ("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1), ("3x3_1x3x224x224_st2_pd1", conv2d_3x3_1x3x224x224_st2_pd1), ("two_conv2d_nobias", two_conv2d_nobias), From 01d526f8b3897f1ea1ebe6fe6517b59222154f7a Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Wed, 15 Jan 2025 01:25:19 -0800 Subject: [PATCH 08/40] make dim order as default for everywhere in ET Differential Revision: D68167741 Pull Request resolved: https://github.com/pytorch/executorch/pull/7658 --- extension/export_util/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/extension/export_util/utils.py b/extension/export_util/utils.py index 66154b95fa..a289355919 100644 --- a/extension/export_util/utils.py +++ b/extension/export_util/utils.py @@ -60,7 +60,6 @@ def _core_aten_to_edge( if not edge_compile_config: edge_compile_config = exir.EdgeCompileConfig( _check_ir_validity=False, # quant ops currently break ir verification - _skip_dim_order=True, # TODO(T182928844): dim order ops can not delegate to backend ) edge_manager: EdgeProgramManager = to_edge( core_aten_exir_ep, From a18f6e89631ab97f6211aaae537bfe2cdd90729e Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 15 Jan 2025 10:58:51 -0500 
Subject: [PATCH 09/40] Update torchtune pin (#7670) --- examples/models/llama3_2_vision/install_requirements.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama3_2_vision/install_requirements.sh b/examples/models/llama3_2_vision/install_requirements.sh index a4d789d56e..4d4a6f2862 100755 --- a/examples/models/llama3_2_vision/install_requirements.sh +++ b/examples/models/llama3_2_vision/install_requirements.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -NIGHTLY_VERSION="dev20241112" +NIGHTLY_VERSION="dev20250115" # Install torchtune nightly for model definitions. -pip install --pre torchtune==0.4.0.${NIGHTLY_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu --no-cache-dir +pip install --pre torchtune==0.6.0.${NIGHTLY_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu --no-cache-dir From d596cd78cf2280c9c01adbfc95b54a29865f3fe5 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 15 Jan 2025 13:34:43 -0600 Subject: [PATCH 10/40] [ET-VK][ez] Test specific sizes of linear sizes in generated operator tests (#7672) Pull Request resolved: https://github.com/pytorch/executorch/pull/7667 ## Context Recent changes related to checking SPIR-V capability support at runtime have made it possible to test the 8-bit quantized linear compute shader on Android devices. Previously the test would be automatically skipped since the operator potentially uses 8-bit data types. To make the generated tests more useful, instead test real sizes of linear layer settings found in a sample model in the 8-bit linear test case. 
ghstack-source-id: 261524380 @exported-using-ghexport Differential Revision: [D68192068](https://our.internmc.facebook.com/intern/diff/D68192068/) Co-authored-by: Stephen Jia --- backends/vulkan/test/op_tests/cases.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index d32fa71573..9cec4891c1 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -169,7 +169,13 @@ def get_linear_test_suites(): @register_test_suite("aten._weight_int8pack_mm.default") def get_weight_int8pack_mm_inputs(): - MKN_list = common_MKN_list + MKN_list = [ + [6, 480, 256], + [6, 256, 1024], + [6, 1024, 256], + [6, 256, 256], + [6, 256, 512], + ] inputs_list = [((M, K), (N, K), (N)) for M, K, N in MKN_list] From 0dbe214712cff2d0eac1d33743ae26aa67d19d04 Mon Sep 17 00:00:00 2001 From: wesleyer Date: Wed, 15 Jan 2025 12:41:57 -0800 Subject: [PATCH 11/40] Fix duplication of classes in modules Differential Revision: D68027619 Pull Request resolved: https://github.com/pytorch/executorch/pull/7606 --- extension/android/BUCK | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/extension/android/BUCK b/extension/android/BUCK index 5d021250e6..040c9258d4 100644 --- a/extension/android/BUCK +++ b/extension/android/BUCK @@ -23,18 +23,13 @@ fb_android_library( fb_android_library( name = "executorch_llama", srcs = [ - "src/main/java/org/pytorch/executorch/DType.java", - "src/main/java/org/pytorch/executorch/EValue.java", "src/main/java/org/pytorch/executorch/LlamaCallback.java", "src/main/java/org/pytorch/executorch/LlamaModule.java", - "src/main/java/org/pytorch/executorch/Module.java", - "src/main/java/org/pytorch/executorch/NativePeer.java", - "src/main/java/org/pytorch/executorch/Tensor.java", - "src/main/java/org/pytorch/executorch/annotations/Experimental.java", ], autoglob = False, language = "JAVA", deps = [ + ":executorch", 
"//fbandroid/java/com/facebook/jni:jni", "//fbandroid/libraries/soloader/java/com/facebook/soloader/nativeloader:nativeloader", ], From ba6c55211a03be8cca93750f510ff06c22a11a1b Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 15 Jan 2025 16:16:11 -0500 Subject: [PATCH 12/40] Enable bot to cc people (#7680) Summary: The bot will configure based on this tracking issue https://github.com/pytorch/executorch/issues/7679 --- .github/pytorch-probot.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 2b66829ed0..a83087c8d5 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,4 +1,5 @@ # The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml +tracking_issue: 7679 ciflow_push_tags: - ciflow/android - ciflow/apple From a727b55b3584c9f6330eae3c4762a2ca500247dc Mon Sep 17 00:00:00 2001 From: cccclai Date: Wed, 15 Jan 2025 13:54:50 -0800 Subject: [PATCH 13/40] fix delegate cache duplicate bug Differential Revision: D67067997 Pull Request resolved: https://github.com/pytorch/executorch/pull/7281 --- exir/_serialize/_program.py | 8 +++-- exir/backend/test/demos/rpc/TARGETS | 1 + exir/emit/_emitter.py | 23 ++++++++---- exir/emit/test/TARGETS | 1 + exir/emit/test/test_emit.py | 56 ++++++++++++++++++++++++++++- 5 files changed, 79 insertions(+), 10 deletions(-) diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index 80b740674a..7656ea3f36 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -224,6 +224,7 @@ def _extract_delegate_segments( """ remaining_inline: List[BackendDelegateInlineData] = [] inline_indices_seen: set[int] = set() + segment_index_map: dict[bytes, int] = {} for plan in program.execution_plan: for delegate in plan.delegates: if delegate.processed.location != DataLocation.INLINE: @@ -249,8 +250,11 @@ def _extract_delegate_segments( inline_indices_seen.add(delegate.processed.index) if 
inline.data: # Move the delegate data out of the program. - segment_index = len(segments) - segments.append(Cord(inline.data)) + segment_index = segment_index_map.get(inline.data) + if segment_index is None: + segment_index = len(segments) + segments.append(Cord(inline.data)) + segment_index_map[inline.data] = segment_index delegate.processed = BackendDelegateDataReference( location=DataLocation.SEGMENT, index=segment_index, diff --git a/exir/backend/test/demos/rpc/TARGETS b/exir/backend/test/demos/rpc/TARGETS index a2aadb05ef..63d24ccbda 100644 --- a/exir/backend/test/demos/rpc/TARGETS +++ b/exir/backend/test/demos/rpc/TARGETS @@ -28,6 +28,7 @@ runtime.python_library( ], visibility = [ "//executorch/exir/backend/test/...", + "//executorch/exir/emit/test/...", ], deps = [ ":executor_backend_preprocess", diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py index d08e68fa73..c40a00b240 100644 --- a/exir/emit/_emitter.py +++ b/exir/emit/_emitter.py @@ -122,6 +122,8 @@ class _ProgramState: # Delegate data stored directly in the flatbuffer. Pointed to by BackendDelegateDataReference, # and should be copied to Program.backend_delegate_data. backend_delegate_data: List[BackendDelegateInlineData] = field(default_factory=list) + # Delegate cache that is used across all entry points. Key is the hash of the delegated payload. + backend_delegate_data_cache: Dict[str, int] = field(default_factory=dict) # Constants are optionally stored in external files. # Aggregate unique external constants into one buffer. 
@@ -144,7 +146,8 @@ class _EmitterState: operators: List[Operator] delegates: List[BackendDelegate] operator_cache: Dict[Tuple[str, str], int] - delegate_cache: Dict[bytes, int] + # delegate_cache: the key is hash(delegated_payload) and the value is the index in delegates + delegate_cache: Dict[str, int] emit_stacktrace: bool spec2id_dict: Dict[TensorSpec, int] = field(default_factory=dict) @@ -1073,8 +1076,8 @@ def _emit_delegate( """Emit the delegates inputs and outputs as specified by the schema, then emit the delegate's blob.""" processed_bytes = lowered_module.processed_bytes - - delegate_index = self.emitter_state.delegate_cache.get(processed_bytes) + hashed = hashlib.sha256(processed_bytes).hexdigest() + delegate_index = self.emitter_state.delegate_cache.get(hashed) delegate_ret = None if isinstance(self.node.meta["spec"], list): @@ -1112,10 +1115,16 @@ def _emit_delegate( if delegate_index is None: # Allocate an entry for the data. TODO(T150113674): Reuse any duplicate entries if # present. 
- data_index: int = len(self.program_state.backend_delegate_data) - self.program_state.backend_delegate_data.append( - BackendDelegateInlineData(data=processed_bytes) + hashed = hashlib.sha256(processed_bytes).hexdigest() + data_index: Optional[int] = ( + self.program_state.backend_delegate_data_cache.get(hashed) ) + if data_index is None: + data_index = len(self.program_state.backend_delegate_data) + self.program_state.backend_delegate_data_cache[hashed] = data_index + self.program_state.backend_delegate_data.append( + BackendDelegateInlineData(data=processed_bytes) + ) backend_delegate = BackendDelegate( id=lowered_module.backend_id, @@ -1126,7 +1135,7 @@ def _emit_delegate( ) delegate_index = len(self.emitter_state.delegate_cache) self.emitter_state.delegates.append(backend_delegate) - self.emitter_state.delegate_cache[processed_bytes] = delegate_index + self.emitter_state.delegate_cache[hashed] = delegate_index # TODO(angelayi) Will need to emit the kwargs too, in the correct order according to the # function's spec and with default arguments. 
This requires us to store the function's spec diff --git a/exir/emit/test/TARGETS b/exir/emit/test/TARGETS index 9f416e78ea..153843d45e 100644 --- a/exir/emit/test/TARGETS +++ b/exir/emit/test/TARGETS @@ -16,6 +16,7 @@ python_unittest( "//executorch/exir:lib", "//executorch/exir:print_program", "//executorch/exir:schema", + "//executorch/exir/backend/test/demos/rpc:executor_backend_partitioner", "//executorch/exir/backend:backend_api", "//executorch/exir/emit:lib", "//executorch/exir/passes:const_prop_pass", diff --git a/exir/emit/test/test_emit.py b/exir/emit/test/test_emit.py index 0da4085914..3fca3958fe 100644 --- a/exir/emit/test/test_emit.py +++ b/exir/emit/test/test_emit.py @@ -27,6 +27,9 @@ from executorch.exir._serialize._program import deserialize_pte_binary from executorch.exir.backend.backend_api import to_backend from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult +from executorch.exir.backend.test.demos.rpc.executor_backend_partitioner import ( + ExecutorBackendPartitioner, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.emit import emit_program # noqa from executorch.exir.error import InternalError @@ -63,7 +66,7 @@ from functorch.experimental import control_flow from torch import nn -from torch.export import Dim, export +from torch.export import Dim, export, export_for_training class WrapperModule(torch.nn.Module): @@ -1679,3 +1682,54 @@ def forward(self, x): ] self.assertEqual(external_map["linear.weight"], 0) self.assertEqual(external_map["linear.bias"], 1) + + def test_delegate_deduplicate(self) -> None: + class SharedModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(2, 2) + + def forward(self, x): + return self.linear(x) + + class Module1(torch.nn.Module): + def __init__(self, shared_module): + super().__init__() + self.shared_module = shared_module + + def forward(self, x): + return self.shared_module(x) + + class 
Module2(torch.nn.Module): + def __init__(self, shared_module): + super().__init__() + self.shared_module = shared_module + + def forward(self, x): + return self.shared_module(x) + + shared_module = SharedModule() + module_1 = Module1(shared_module) + module_2 = Module2(shared_module) + example_inputs = (torch.randn(2, 2),) + module_1(*example_inputs) + module_2(*example_inputs) + + ep1 = export_for_training(module_1, example_inputs) + ep2 = export_for_training(module_2, example_inputs) + + edge_program_manager = exir.to_edge( + {"forward1": ep1, "forward2": ep2}, + compile_config=exir.EdgeCompileConfig( + _check_ir_validity=False, _use_edge_ops=True + ), + ) + + edge_program_manager = edge_program_manager.to_backend( + ExecutorBackendPartitioner() + ).to_executorch() + + # Check that there is only one delegate because two methods are exactly the same + self.assertEqual( + len(edge_program_manager.executorch_program.backend_delegate_data), 1 + ) From ee00caa7ed26917a05706e62a5850e6a2454d42c Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Wed, 15 Jan 2025 18:55:24 -0600 Subject: [PATCH 14/40] [xnnpack] Add debug XNNGraph printing (#7617) Prints to a file, with increasing id. TODO: use actual delegate instance id in the filename. Take filepath from compile_spec. --- .../serialization/xnnpack_graph_serialize.py | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/backends/xnnpack/serialization/xnnpack_graph_serialize.py b/backends/xnnpack/serialization/xnnpack_graph_serialize.py index 160c926780..0fbd0ddc5e 100644 --- a/backends/xnnpack/serialization/xnnpack_graph_serialize.py +++ b/backends/xnnpack/serialization/xnnpack_graph_serialize.py @@ -5,11 +5,13 @@ # LICENSE file in the root directory of this source tree. 
import json + +import logging import os import tempfile from dataclasses import dataclass, fields, is_dataclass -from typing import ClassVar, Literal +from typing import ClassVar, Literal, Optional import pkg_resources from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import XNNGraph @@ -17,6 +19,9 @@ from executorch.exir._serialize._flatbuffer import _flatc_compile +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + # Byte order of numbers written to program headers. Always little-endian # regardless of the host system, since all commonly-used modern CPUs are little # endian. @@ -273,19 +278,42 @@ def _pad_to(data: bytes, length: int) -> bytes: return data -def pretty_print_xnngraph(xnnpack_graph_json: str): +def pretty_print_xnngraph(xnnpack_graph_json: str, filename: Optional[str] = None): """ - Pretty print the XNNGraph + Pretty print the XNNGraph, optionally writing to a file if filename is provided """ - from pprint import pprint + from pprint import pformat d = json.loads(xnnpack_graph_json) - pprint(d) + pstr = pformat(d, indent=2, compact=True).replace("'", '"') + if filename: + with open(filename, "w") as f: + if filename.endswith(".json"): + pstr = pstr.replace("None", "null") + f.write(pstr) + else: # dump to stdout + print("XNNGraph:") + print(pstr) + print("End of XNNGraph") + + +# TODO: Replace this with an actual delegate id +_delegate_instance_id = 0 def convert_to_flatbuffer(xnnpack_graph: XNNGraph) -> bytes: + global _delegate_instance_id sanity_check_xnngraph_dataclass(xnnpack_graph) xnnpack_graph_json = json.dumps(xnnpack_graph, cls=_DataclassEncoder) + + # Log the XNNGraph if debugging + if logger.getEffectiveLevel() == logging.DEBUG: + filename: str = f"./xnnpack_delegate_graph_{_delegate_instance_id}.json" + logger.debug(f"Writing XNNGraph to {filename}") + pretty_print_xnngraph(xnnpack_graph_json, filename) + + _delegate_instance_id += 1 + with tempfile.TemporaryDirectory() as d: schema_path = 
os.path.join(d, "schema.fbs") with open(schema_path, "wb") as schema_file: From 6aa5c8a1a473197f403c0b7a8804c39f4126face Mon Sep 17 00:00:00 2001 From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com> Date: Wed, 15 Jan 2025 23:40:52 -0800 Subject: [PATCH 15/40] Add uint16 to supported dtypes and regenerate edge.yaml Differential Revision: D68241997 Pull Request resolved: https://github.com/pytorch/executorch/pull/7687 --- exir/dialects/edge/dtype/runner.py | 4 +- exir/dialects/edge/edge.yaml | 1308 +++++++++++++-------- exir/dialects/edge/test/test_edge_yaml.py | 1 + 3 files changed, 832 insertions(+), 481 deletions(-) diff --git a/exir/dialects/edge/dtype/runner.py b/exir/dialects/edge/dtype/runner.py index ef488433fb..67982a164e 100644 --- a/exir/dialects/edge/dtype/runner.py +++ b/exir/dialects/edge/dtype/runner.py @@ -17,7 +17,9 @@ class DtypeRunner: def __init__(self): - self.tensor_dtypes = list(common_dtype.all_types_and(torch.bool, torch.half)) + self.tensor_dtypes = list( + common_dtype.all_types_and(torch.bool, torch.half, torch.uint16) + ) self.scalar_dtypes = [torch.bool, torch.int, torch.float] @staticmethod diff --git a/exir/dialects/edge/edge.yaml b/exir/dialects/edge/edge.yaml index 3d052fc944..039490a839 100644 --- a/exir/dialects/edge/edge.yaml +++ b/exir/dialects/edge/edge.yaml @@ -26,7 +26,7 @@ inherits: aten::_to_copy type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Byte] T3: [Char] T4: [Double] @@ -35,6 +35,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T1 dtype: T0 @@ -63,6 +64,9 @@ - self: T1 dtype: T9 __ret_0: T9 + - self: T1 + dtype: T10 + __ret_0: T10 - func: aten::abs namespace: edge @@ -77,7 +81,7 @@ namespace: edge inherits: aten::acos type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: 
[Float] type_constraint: @@ -90,7 +94,7 @@ namespace: edge inherits: aten::acosh type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -104,7 +108,7 @@ inherits: aten::add.Scalar type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Float, Int, Long, Short] + T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T2: [Bool, Float, Int] T3: [Bool, Int] T4: [Bool, Long] @@ -117,6 +121,7 @@ T11: [Int] T12: [Long] T13: [Short] + T14: [UInt16] type_constraint: - self: T0 other: T0 @@ -238,6 +243,10 @@ other: T8 alpha: T9 __ret_0: T8 + - self: T14 + other: T8 + alpha: T9 + __ret_0: T8 - func: aten::add.Tensor namespace: edge @@ -245,9 +254,9 @@ type_alias: T0: [Bool] T1: [Bool, Byte] - T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T3: [Bool, Byte, Char, Float, Half, Int, Long, Short] - T4: [Bool, Byte, Char, Half, Int, Long, Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] + T4: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] T5: [Bool, Byte, Char, Int, Long, Short] T6: [Bool, Byte, Char, Int, Short] T7: [Bool, Byte, Char, Short] @@ -264,6 +273,7 @@ T18: [Int] T19: [Long] T20: [Short] + T21: [UInt16] type_constraint: - self: T0 other: T0 @@ -417,6 +427,10 @@ other: T20 alpha: T16 __ret_0: T14 + - self: T14 + other: T21 + alpha: T16 + __ret_0: T14 - self: T15 other: T0 alpha: T16 @@ -461,6 +475,10 @@ other: T20 alpha: T16 __ret_0: T15 + - self: T15 + other: T21 + alpha: T16 + __ret_0: T15 - self: T17 other: T0 alpha: T16 @@ -505,6 +523,10 @@ other: T20 alpha: T16 __ret_0: T17 + - self: T17 + other: T21 + alpha: T16 + __ret_0: T17 - self: T18 other: T6 alpha: T18 @@ -553,6 +575,18 @@ other: T17 alpha: T16 __ret_0: T17 + - self: T21 + other: T14 + alpha: T16 + __ret_0: T14 + - self: T21 + other: T15 + alpha: T16 + __ret_0: T15 + - self: T21 + other: 
T17 + alpha: T16 + __ret_0: T17 - func: aten::addmm namespace: edge @@ -862,7 +896,7 @@ namespace: edge inherits: aten::alias_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -890,7 +924,7 @@ inherits: aten::any type_alias: T0: [Bool] - T1: [Bool, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Byte] type_constraint: - self: T1 @@ -2057,7 +2091,7 @@ namespace: edge inherits: aten::as_strided_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -2066,7 +2100,7 @@ namespace: edge inherits: aten::asin type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -2079,7 +2113,7 @@ namespace: edge inherits: aten::asinh type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -2092,7 +2126,7 @@ namespace: edge inherits: aten::atan type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -2105,7 +2139,7 @@ namespace: edge inherits: aten::atanh type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -2135,6 +2169,7 @@ T5: [Int] T6: [Long] T7: [Short] + T8: [UInt16] type_constraint: - self: T0 other: T0 @@ -2157,17 +2192,20 @@ - self: T7 other: T1 __ret_0: T7 + - self: T8 + other: T1 + __ret_0: T8 - func: aten::bitwise_and.Tensor namespace: edge inherits: 
aten::bitwise_and.Tensor type_alias: - T0: [Bool] - T1: [Bool, Byte] - T2: [Bool, Byte, Char, Int, Long, Short] - T3: [Bool, Byte, Char, Int, Short] - T4: [Bool, Byte, Char, Short] - T5: [Bool, Char] + T0: [Bool, Byte] + T1: [Bool, Byte, Char, Int, Long, Short] + T2: [Bool, Byte, Char, Int, Short] + T3: [Bool, Byte, Char, Short] + T4: [Bool, Char] + T5: [Bool, UInt16] T6: [Byte] T7: [Byte, Short] T8: [Char] @@ -2177,25 +2215,25 @@ T12: [Short] type_constraint: - self: T0 - other: T0 - __ret_0: T0 - - self: T1 other: T6 __ret_0: T6 - - self: T2 + - self: T1 other: T11 __ret_0: T11 - - self: T3 + - self: T2 other: T10 __ret_0: T10 - - self: T4 + - self: T3 other: T12 __ret_0: T12 - - self: T5 + - self: T4 other: T8 __ret_0: T8 + - self: T5 + other: T5 + __ret_0: T5 - self: T6 - other: T1 + other: T0 __ret_0: T6 - self: T6 other: T9 @@ -2204,7 +2242,7 @@ other: T8 __ret_0: T12 - self: T8 - other: T5 + other: T4 __ret_0: T8 - self: T8 other: T7 @@ -2213,13 +2251,13 @@ other: T6 __ret_0: T12 - self: T10 - other: T3 + other: T2 __ret_0: T10 - self: T11 - other: T2 + other: T1 __ret_0: T11 - self: T12 - other: T4 + other: T3 __ret_0: T12 - func: aten::bitwise_not @@ -2243,6 +2281,7 @@ T5: [Int] T6: [Long] T7: [Short] + T8: [UInt16] type_constraint: - self: T0 other: T0 @@ -2265,17 +2304,20 @@ - self: T7 other: T1 __ret_0: T7 + - self: T8 + other: T1 + __ret_0: T8 - func: aten::bitwise_or.Tensor namespace: edge inherits: aten::bitwise_or.Tensor type_alias: - T0: [Bool] - T1: [Bool, Byte] - T2: [Bool, Byte, Char, Int, Long, Short] - T3: [Bool, Byte, Char, Int, Short] - T4: [Bool, Byte, Char, Short] - T5: [Bool, Char] + T0: [Bool, Byte] + T1: [Bool, Byte, Char, Int, Long, Short] + T2: [Bool, Byte, Char, Int, Short] + T3: [Bool, Byte, Char, Short] + T4: [Bool, Char] + T5: [Bool, UInt16] T6: [Byte] T7: [Byte, Short] T8: [Char] @@ -2285,25 +2327,25 @@ T12: [Short] type_constraint: - self: T0 - other: T0 - __ret_0: T0 - - self: T1 other: T6 __ret_0: T6 - - self: T2 + - self: T1 
other: T11 __ret_0: T11 - - self: T3 + - self: T2 other: T10 __ret_0: T10 - - self: T4 + - self: T3 other: T12 __ret_0: T12 - - self: T5 + - self: T4 other: T8 __ret_0: T8 + - self: T5 + other: T5 + __ret_0: T5 - self: T6 - other: T1 + other: T0 __ret_0: T6 - self: T6 other: T9 @@ -2312,7 +2354,7 @@ other: T8 __ret_0: T12 - self: T8 - other: T5 + other: T4 __ret_0: T8 - self: T8 other: T7 @@ -2321,13 +2363,13 @@ other: T6 __ret_0: T12 - self: T10 - other: T3 + other: T2 __ret_0: T10 - self: T11 - other: T2 + other: T1 __ret_0: T11 - self: T12 - other: T4 + other: T3 __ret_0: T12 - func: aten::bitwise_xor.Scalar @@ -2342,6 +2384,7 @@ T5: [Int] T6: [Long] T7: [Short] + T8: [UInt16] type_constraint: - self: T0 other: T0 @@ -2364,17 +2407,20 @@ - self: T7 other: T1 __ret_0: T7 + - self: T8 + other: T1 + __ret_0: T8 - func: aten::bitwise_xor.Tensor namespace: edge inherits: aten::bitwise_xor.Tensor type_alias: - T0: [Bool] - T1: [Bool, Byte] - T2: [Bool, Byte, Char, Int, Long, Short] - T3: [Bool, Byte, Char, Int, Short] - T4: [Bool, Byte, Char, Short] - T5: [Bool, Char] + T0: [Bool, Byte] + T1: [Bool, Byte, Char, Int, Long, Short] + T2: [Bool, Byte, Char, Int, Short] + T3: [Bool, Byte, Char, Short] + T4: [Bool, Char] + T5: [Bool, UInt16] T6: [Byte] T7: [Byte, Short] T8: [Char] @@ -2384,25 +2430,25 @@ T12: [Short] type_constraint: - self: T0 - other: T0 - __ret_0: T0 - - self: T1 other: T6 __ret_0: T6 - - self: T2 + - self: T1 other: T11 __ret_0: T11 - - self: T3 + - self: T2 other: T10 __ret_0: T10 - - self: T4 + - self: T3 other: T12 __ret_0: T12 - - self: T5 + - self: T4 other: T8 __ret_0: T8 + - self: T5 + other: T5 + __ret_0: T5 - self: T6 - other: T1 + other: T0 __ret_0: T6 - self: T6 other: T9 @@ -2411,7 +2457,7 @@ other: T8 __ret_0: T12 - self: T8 - other: T5 + other: T4 __ret_0: T8 - self: T8 other: T7 @@ -2420,13 +2466,13 @@ other: T6 __ret_0: T12 - self: T10 - other: T3 + other: T2 __ret_0: T10 - self: T11 - other: T2 + other: T1 __ret_0: T11 - self: T12 - 
other: T4 + other: T3 __ret_0: T12 - func: aten::bmm @@ -2443,7 +2489,7 @@ namespace: edge inherits: aten::cat type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - tensors: T0 __ret_0: T0 @@ -2452,7 +2498,7 @@ namespace: edge inherits: aten::ceil type_alias: - T0: [Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -2462,7 +2508,7 @@ inherits: aten::clamp type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Float, Int, Long, Short] + T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T2: [Bool, Float, Int] T3: [Bool, Int] T4: [Bool, Long] @@ -2474,6 +2520,7 @@ T10: [Int] T11: [Long] T12: [Short] + T13: [UInt16] type_constraint: - self: T0 min: T2 @@ -2715,12 +2762,20 @@ min: T10 max: T3 __ret_0: T12 + - self: T13 + min: T2 + max: T8 + __ret_0: T8 + - self: T13 + min: T8 + max: T2 + __ret_0: T8 - func: aten::clone namespace: edge inherits: aten::clone type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -2739,6 +2794,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T0 value: T1 @@ -2767,12 +2823,15 @@ - self: T9 value: T1 __ret_0: T9 + - self: T10 + value: T1 + __ret_0: T10 - func: aten::convolution namespace: edge inherits: aten::convolution type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T1: [Byte] T2: [Char] T3: [Double] @@ -2820,7 +2879,7 @@ inherits: aten::copy type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Byte] T3: [Char] T4: [Double] @@ -2829,6 +2888,7 @@ T7: 
[Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T0 src: T1 @@ -2857,12 +2917,15 @@ - self: T9 src: T1 __ret_0: T9 + - self: T10 + src: T1 + __ret_0: T10 - func: aten::cos namespace: edge inherits: aten::cos type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -2875,7 +2938,7 @@ namespace: edge inherits: aten::cosh type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -2888,7 +2951,7 @@ namespace: edge inherits: aten::cumsum type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T1: [Byte] T2: [Char] T3: [Double] @@ -2927,7 +2990,7 @@ namespace: edge inherits: aten::detach_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -2937,7 +3000,7 @@ inherits: aten::div.Scalar type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Float, Int, Long, Short] + T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T2: [Bool, Float, Int] T3: [Byte] T4: [Char] @@ -2947,6 +3010,7 @@ T8: [Int] T9: [Long] T10: [Short] + T11: [UInt16] type_constraint: - self: T0 other: T2 @@ -2984,24 +3048,29 @@ - self: T10 other: T2 __ret_0: T6 + - self: T11 + other: T2 + __ret_0: T6 - func: aten::div.Tensor namespace: edge inherits: aten::div.Tensor type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Byte, Char, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] T3: [Bool, Byte, Char, Float, Int, Long, Short] - T4: [Bool, Byte, Char, Half, Int, Long, Short] 
+ T4: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] T5: [Byte] T6: [Char] T7: [Double] T8: [Float] - T9: [Half] - T10: [Int] - T11: [Long] - T12: [Short] + T9: [Float, UInt16] + T10: [Half] + T11: [Int] + T12: [Long] + T13: [Short] + T14: [UInt16] type_constraint: - self: T0 other: T3 @@ -3021,18 +3090,18 @@ - self: T3 other: T6 __ret_0: T8 - - self: T3 - other: T10 - __ret_0: T8 - self: T3 other: T11 __ret_0: T8 - self: T3 other: T12 __ret_0: T8 + - self: T3 + other: T13 + __ret_0: T8 - self: T4 - other: T9 - __ret_0: T9 + other: T10 + __ret_0: T10 - self: T5 other: T3 __ret_0: T8 @@ -3046,26 +3115,32 @@ other: T2 __ret_0: T8 - self: T9 - other: T4 - __ret_0: T9 - - self: T10 - other: T3 + other: T14 __ret_0: T8 + - self: T10 + other: T4 + __ret_0: T10 - self: T11 other: T3 __ret_0: T8 - self: T12 other: T3 __ret_0: T8 + - self: T13 + other: T3 + __ret_0: T8 + - self: T14 + other: T9 + __ret_0: T8 - func: aten::div.Tensor_mode namespace: edge inherits: aten::div.Tensor_mode type_alias: T0: [Bool, Byte] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Byte, Char, Float, Half, Int, Long, Short] - T3: [Bool, Byte, Char, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] T4: [Bool, Byte, Char, Int, Long, Short] T5: [Bool, Byte, Char, Int, Short] T6: [Bool, Byte, Char, Short] @@ -3156,6 +3231,7 @@ T7: [Int, Long] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - weight: T0 indices: T7 @@ -3184,12 +3260,15 @@ - weight: T9 indices: T7 __ret_0: T9 + - weight: T10 + indices: T7 + __ret_0: T10 - func: aten::empty.memory_format namespace: edge inherits: aten::empty.memory_format type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - dtype: T0 __ret_0: T0 @@ 
-3199,7 +3278,7 @@ inherits: aten::eq.Scalar type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Bool, Float, Int] T3: [Byte] T4: [Char] @@ -3209,6 +3288,7 @@ T8: [Int] T9: [Long] T10: [Short] + T11: [UInt16] type_constraint: - self: T0 other: T2 @@ -3246,12 +3326,15 @@ - self: T10 other: T2 __ret_0: T0 + - self: T11 + other: T2 + __ret_0: T0 - func: aten::erf namespace: edge inherits: aten::erf type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -3264,7 +3347,7 @@ namespace: edge inherits: aten::exp type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -3277,7 +3360,7 @@ namespace: edge inherits: aten::expand_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -3296,6 +3379,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T0 value: T1 @@ -3324,13 +3408,16 @@ - self: T9 value: T1 __ret_0: T9 + - self: T10 + value: T1 + __ret_0: T10 - func: aten::fill.Tensor namespace: edge inherits: aten::fill.Tensor type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Byte] T3: [Char] T4: [Double] @@ -3339,6 +3426,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T0 value: T1 @@ -3367,12 +3455,15 @@ - self: T9 value: T1 __ret_0: T9 + - self: T10 + value: T1 + __ret_0: T10 - func: aten::floor namespace: edge inherits: aten::floor type_alias: - T0: [Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Byte, Char, Double, Float, 
Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -3382,9 +3473,9 @@ inherits: aten::floor_divide type_alias: T0: [Bool, Byte] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Byte, Char, Float, Half, Int, Long, Short] - T3: [Bool, Byte, Char, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] T4: [Bool, Byte, Char, Int, Long, Short] T5: [Bool, Byte, Char, Int, Short] T6: [Bool, Byte, Char, Short] @@ -3465,7 +3556,7 @@ namespace: edge inherits: aten::fmod.Scalar type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Bool, Float, Int] T2: [Bool, Int] T3: [Bool, Long] @@ -3514,9 +3605,9 @@ inherits: aten::fmod.Tensor type_alias: T0: [Bool, Byte] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Byte, Char, Float, Half, Int, Long, Short] - T3: [Bool, Byte, Char, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] T4: [Bool, Byte, Char, Int, Long, Short] T5: [Bool, Byte, Char, Int, Short] T6: [Bool, Byte, Char, Short] @@ -3607,6 +3698,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - fill_value: T1 dtype: T0 @@ -3635,13 +3727,16 @@ - fill_value: T1 dtype: T9 __ret_0: T9 + - fill_value: T1 + dtype: T10 + __ret_0: T10 - func: aten::full_like namespace: edge inherits: aten::full_like type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Bool, Float, Int] T3: [Byte] T4: [Char] @@ -3651,6 +3746,7 @@ T8: [Int] T9: [Long] T10: [Short] + T11: [UInt16] type_constraint: - 
self: T0 fill_value: T2 @@ -3688,6 +3784,10 @@ fill_value: T2 dtype: T10 __ret_0: T10 + - self: T0 + fill_value: T2 + dtype: T11 + __ret_0: T11 - self: T1 fill_value: T0 dtype: T0 @@ -3724,6 +3824,10 @@ fill_value: T0 dtype: T10 __ret_0: T10 + - self: T1 + fill_value: T0 + dtype: T11 + __ret_0: T11 - self: T1 fill_value: T6 dtype: T0 @@ -3760,6 +3864,10 @@ fill_value: T6 dtype: T10 __ret_0: T10 + - self: T1 + fill_value: T6 + dtype: T11 + __ret_0: T11 - self: T1 fill_value: T8 dtype: T0 @@ -3796,6 +3904,10 @@ fill_value: T8 dtype: T10 __ret_0: T10 + - self: T1 + fill_value: T8 + dtype: T11 + __ret_0: T11 - self: T3 fill_value: T2 dtype: T0 @@ -3832,6 +3944,10 @@ fill_value: T2 dtype: T10 __ret_0: T10 + - self: T3 + fill_value: T2 + dtype: T11 + __ret_0: T11 - self: T4 fill_value: T2 dtype: T0 @@ -3868,6 +3984,10 @@ fill_value: T2 dtype: T10 __ret_0: T10 + - self: T4 + fill_value: T2 + dtype: T11 + __ret_0: T11 - self: T5 fill_value: T2 dtype: T0 @@ -3904,6 +4024,10 @@ fill_value: T2 dtype: T10 __ret_0: T10 + - self: T5 + fill_value: T2 + dtype: T11 + __ret_0: T11 - self: T6 fill_value: T2 dtype: T0 @@ -3940,6 +4064,10 @@ fill_value: T2 dtype: T10 __ret_0: T10 + - self: T6 + fill_value: T2 + dtype: T11 + __ret_0: T11 - self: T7 fill_value: T2 dtype: T0 @@ -3976,6 +4104,10 @@ fill_value: T2 dtype: T10 __ret_0: T10 + - self: T7 + fill_value: T2 + dtype: T11 + __ret_0: T11 - self: T8 fill_value: T2 dtype: T0 @@ -4012,6 +4144,10 @@ fill_value: T2 dtype: T10 __ret_0: T10 + - self: T8 + fill_value: T2 + dtype: T11 + __ret_0: T11 - self: T9 fill_value: T2 dtype: T0 @@ -4048,6 +4184,10 @@ fill_value: T2 dtype: T10 __ret_0: T10 + - self: T9 + fill_value: T2 + dtype: T11 + __ret_0: T11 - self: T10 fill_value: T2 dtype: T0 @@ -4084,58 +4224,103 @@ fill_value: T2 dtype: T10 __ret_0: T10 - -- func: aten::ge.Scalar - namespace: edge - inherits: aten::ge.Scalar - type_alias: - T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Float, Int] - T3: 
[Byte] - T4: [Char] - T5: [Double] - T6: [Float] - T7: [Half] - T8: [Int] - T9: [Long] - T10: [Short] - type_constraint: - - self: T0 - other: T2 + - self: T10 + fill_value: T2 + dtype: T11 + __ret_0: T11 + - self: T11 + fill_value: T2 + dtype: T0 __ret_0: T0 - - self: T1 - other: T0 + - self: T11 + fill_value: T2 + dtype: T3 + __ret_0: T3 + - self: T11 + fill_value: T2 + dtype: T4 + __ret_0: T4 + - self: T11 + fill_value: T2 + dtype: T5 + __ret_0: T5 + - self: T11 + fill_value: T2 + dtype: T6 + __ret_0: T6 + - self: T11 + fill_value: T2 + dtype: T7 + __ret_0: T7 + - self: T11 + fill_value: T2 + dtype: T8 + __ret_0: T8 + - self: T11 + fill_value: T2 + dtype: T9 + __ret_0: T9 + - self: T11 + fill_value: T2 + dtype: T10 + __ret_0: T10 + - self: T11 + fill_value: T2 + dtype: T11 + __ret_0: T11 + +- func: aten::ge.Scalar + namespace: edge + inherits: aten::ge.Scalar + type_alias: + T0: [Bool] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Float, Int] + T4: [Byte] + T5: [Char] + T6: [Double] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] + type_constraint: + - self: T0 + other: T3 __ret_0: T0 - self: T1 - other: T6 + other: T0 __ret_0: T0 - self: T1 - other: T8 + other: T9 __ret_0: T0 - - self: T3 - other: T2 + - self: T2 + other: T7 __ret_0: T0 - self: T4 - other: T2 + other: T3 __ret_0: T0 - self: T5 - other: T2 + other: T3 __ret_0: T0 - self: T6 - other: T2 + other: T3 __ret_0: T0 - self: T7 - other: T2 + other: T3 __ret_0: T0 - self: T8 - other: T2 + other: T3 __ret_0: T0 - self: T9 - other: T2 + other: T3 __ret_0: T0 - self: T10 - other: T2 + other: T3 + __ret_0: T0 + - self: T11 + other: T3 __ret_0: T0 - func: aten::ge.Tensor @@ -4144,14 +4329,17 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Byte] - T3: [Char] - T4: [Double] - T5: [Float] - T6: [Half] - T7: [Int] - T8: [Long] - T9: [Short] + T2: 
[Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Byte] + T4: [Char] + T5: [Double] + T6: [Double, Float, Half] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] + T12: [UInt16] type_constraint: - self: T0 other: T1 @@ -4159,9 +4347,6 @@ - self: T1 other: T0 __ret_0: T0 - - self: T1 - other: T2 - __ret_0: T0 - self: T1 other: T3 __ret_0: T0 @@ -4169,22 +4354,22 @@ other: T4 __ret_0: T0 - self: T1 - other: T5 + other: T9 __ret_0: T0 - self: T1 - other: T6 + other: T10 __ret_0: T0 - self: T1 - other: T7 + other: T11 __ret_0: T0 - - self: T1 - other: T8 + - self: T2 + other: T5 __ret_0: T0 - - self: T1 - other: T9 + - self: T2 + other: T7 __ret_0: T0 - self: T2 - other: T1 + other: T8 __ret_0: T0 - self: T3 other: T1 @@ -4193,20 +4378,29 @@ other: T1 __ret_0: T0 - self: T5 - other: T1 + other: T2 __ret_0: T0 - self: T6 - other: T1 + other: T12 __ret_0: T0 - self: T7 - other: T1 + other: T2 __ret_0: T0 - self: T8 - other: T1 + other: T2 __ret_0: T0 - self: T9 other: T1 __ret_0: T0 + - self: T10 + other: T1 + __ret_0: T0 + - self: T11 + other: T1 + __ret_0: T0 + - self: T12 + other: T6 + __ret_0: T0 - func: aten::gelu namespace: edge @@ -4232,51 +4426,52 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Float, Int] - T3: [Byte] - T4: [Char] - T5: [Double] - T6: [Float] - T7: [Half] - T8: [Int] - T9: [Long] - T10: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Float, Int] + T4: [Byte] + T5: [Char] + T6: [Double] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] type_constraint: - self: T0 - other: T2 + other: T3 __ret_0: T0 - self: T1 other: T0 __ret_0: T0 - self: T1 - other: T6 - __ret_0: T0 - - self: T1 - other: T8 + other: T9 __ret_0: T0 - - self: T3 - other: T2 + - self: T2 + other: T7 __ret_0: T0 - self: T4 - other: T2 + other: T3 __ret_0: T0 - self: T5 - other: T2 + other: T3 __ret_0: T0 - self: T6 - other: T2 + 
other: T3 __ret_0: T0 - self: T7 - other: T2 + other: T3 __ret_0: T0 - self: T8 - other: T2 + other: T3 __ret_0: T0 - self: T9 - other: T2 + other: T3 __ret_0: T0 - self: T10 - other: T2 + other: T3 + __ret_0: T0 + - self: T11 + other: T3 __ret_0: T0 - func: aten::gt.Tensor @@ -4285,14 +4480,17 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Byte] - T3: [Char] - T4: [Double] - T5: [Float] - T6: [Half] - T7: [Int] - T8: [Long] - T9: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Byte] + T4: [Char] + T5: [Double] + T6: [Double, Float, Half] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] + T12: [UInt16] type_constraint: - self: T0 other: T1 @@ -4300,9 +4498,6 @@ - self: T1 other: T0 __ret_0: T0 - - self: T1 - other: T2 - __ret_0: T0 - self: T1 other: T3 __ret_0: T0 @@ -4310,22 +4505,22 @@ other: T4 __ret_0: T0 - self: T1 - other: T5 + other: T9 __ret_0: T0 - self: T1 - other: T6 + other: T10 __ret_0: T0 - self: T1 - other: T7 + other: T11 __ret_0: T0 - - self: T1 - other: T8 + - self: T2 + other: T5 __ret_0: T0 - - self: T1 - other: T9 + - self: T2 + other: T7 __ret_0: T0 - self: T2 - other: T1 + other: T8 __ret_0: T0 - self: T3 other: T1 @@ -4334,20 +4529,29 @@ other: T1 __ret_0: T0 - self: T5 - other: T1 + other: T2 __ret_0: T0 - self: T6 - other: T1 + other: T12 __ret_0: T0 - self: T7 - other: T1 + other: T2 __ret_0: T0 - self: T8 - other: T1 + other: T2 __ret_0: T0 - self: T9 other: T1 __ret_0: T0 + - self: T10 + other: T1 + __ret_0: T0 + - self: T11 + other: T1 + __ret_0: T0 + - self: T12 + other: T6 + __ret_0: T0 - func: aten::hardtanh namespace: edge @@ -4666,6 +4870,7 @@ T7: [Int, Long] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T0 index: T7 @@ -4694,13 +4899,16 @@ - self: T9 index: T7 __ret_0: T9 + - self: T10 + index: T7 + __ret_0: T10 - func: aten::isinf namespace: edge inherits: aten::isinf type_alias: T0: [Bool] - T1: [Bool, 
Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T1 __ret_0: T0 @@ -4710,7 +4918,7 @@ inherits: aten::isnan type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T1 __ret_0: T0 @@ -4721,51 +4929,52 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Float, Int] - T3: [Byte] - T4: [Char] - T5: [Double] - T6: [Float] - T7: [Half] - T8: [Int] - T9: [Long] - T10: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Float, Int] + T4: [Byte] + T5: [Char] + T6: [Double] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] type_constraint: - self: T0 - other: T2 + other: T3 __ret_0: T0 - self: T1 other: T0 __ret_0: T0 - self: T1 - other: T6 + other: T9 __ret_0: T0 - - self: T1 - other: T8 - __ret_0: T0 - - self: T3 - other: T2 + - self: T2 + other: T7 __ret_0: T0 - self: T4 - other: T2 + other: T3 __ret_0: T0 - self: T5 - other: T2 + other: T3 __ret_0: T0 - self: T6 - other: T2 + other: T3 __ret_0: T0 - self: T7 - other: T2 + other: T3 __ret_0: T0 - self: T8 - other: T2 + other: T3 __ret_0: T0 - self: T9 - other: T2 + other: T3 __ret_0: T0 - self: T10 - other: T2 + other: T3 + __ret_0: T0 + - self: T11 + other: T3 __ret_0: T0 - func: aten::le.Tensor @@ -4774,14 +4983,17 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Byte] - T3: [Char] - T4: [Double] - T5: [Float] - T6: [Half] - T7: [Int] - T8: [Long] - T9: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Byte] + T4: [Char] + T5: [Double] + T6: [Double, Float, Half] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] + T12: [UInt16] type_constraint: - self: T0 other: T1 @@ -4789,9 +5001,6 @@ - 
self: T1 other: T0 __ret_0: T0 - - self: T1 - other: T2 - __ret_0: T0 - self: T1 other: T3 __ret_0: T0 @@ -4799,22 +5008,22 @@ other: T4 __ret_0: T0 - self: T1 - other: T5 + other: T9 __ret_0: T0 - self: T1 - other: T6 + other: T10 __ret_0: T0 - self: T1 - other: T7 + other: T11 __ret_0: T0 - - self: T1 - other: T8 + - self: T2 + other: T5 __ret_0: T0 - - self: T1 - other: T9 + - self: T2 + other: T7 __ret_0: T0 - self: T2 - other: T1 + other: T8 __ret_0: T0 - self: T3 other: T1 @@ -4823,20 +5032,29 @@ other: T1 __ret_0: T0 - self: T5 - other: T1 + other: T2 __ret_0: T0 - self: T6 - other: T1 + other: T12 __ret_0: T0 - self: T7 - other: T1 + other: T2 __ret_0: T0 - self: T8 - other: T1 + other: T2 __ret_0: T0 - self: T9 other: T1 __ret_0: T0 + - self: T10 + other: T1 + __ret_0: T0 + - self: T11 + other: T1 + __ret_0: T0 + - self: T12 + other: T6 + __ret_0: T0 - func: aten::leaky_relu namespace: edge @@ -4861,7 +5079,7 @@ namespace: edge inherits: aten::lift_fresh_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -4870,7 +5088,7 @@ namespace: edge inherits: aten::log type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -4885,14 +5103,17 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Byte] - T3: [Char] - T4: [Double] - T5: [Float] - T6: [Half] - T7: [Int] - T8: [Long] - T9: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Byte] + T4: [Char] + T5: [Double] + T6: [Double, Float, Half] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] + T12: [UInt16] type_constraint: - self: T0 other: T1 @@ -4900,9 +5121,6 @@ - self: T1 other: T0 __ret_0: T0 - - self: T1 - other: T2 - __ret_0: T0 - self: T1 other: T3 __ret_0: 
T0 @@ -4910,22 +5128,22 @@ other: T4 __ret_0: T0 - self: T1 - other: T5 + other: T9 __ret_0: T0 - self: T1 - other: T6 + other: T10 __ret_0: T0 - self: T1 - other: T7 + other: T11 __ret_0: T0 - - self: T1 - other: T8 + - self: T2 + other: T5 __ret_0: T0 - - self: T1 - other: T9 + - self: T2 + other: T7 __ret_0: T0 - self: T2 - other: T1 + other: T8 __ret_0: T0 - self: T3 other: T1 @@ -4934,20 +5152,29 @@ other: T1 __ret_0: T0 - self: T5 - other: T1 + other: T2 __ret_0: T0 - self: T6 - other: T1 + other: T12 __ret_0: T0 - self: T7 - other: T1 + other: T2 __ret_0: T0 - self: T8 - other: T1 + other: T2 __ret_0: T0 - self: T9 other: T1 __ret_0: T0 + - self: T10 + other: T1 + __ret_0: T0 + - self: T11 + other: T1 + __ret_0: T0 + - self: T12 + other: T6 + __ret_0: T0 - func: aten::logical_not namespace: edge @@ -4965,14 +5192,17 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Byte] - T3: [Char] - T4: [Double] - T5: [Float] - T6: [Half] - T7: [Int] - T8: [Long] - T9: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Byte] + T4: [Char] + T5: [Double] + T6: [Double, Float, Half] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] + T12: [UInt16] type_constraint: - self: T0 other: T1 @@ -4980,9 +5210,6 @@ - self: T1 other: T0 __ret_0: T0 - - self: T1 - other: T2 - __ret_0: T0 - self: T1 other: T3 __ret_0: T0 @@ -4990,22 +5217,22 @@ other: T4 __ret_0: T0 - self: T1 - other: T5 + other: T9 __ret_0: T0 - self: T1 - other: T6 + other: T10 __ret_0: T0 - self: T1 - other: T7 + other: T11 __ret_0: T0 - - self: T1 - other: T8 + - self: T2 + other: T5 __ret_0: T0 - - self: T1 - other: T9 + - self: T2 + other: T7 __ret_0: T0 - self: T2 - other: T1 + other: T8 __ret_0: T0 - self: T3 other: T1 @@ -5014,20 +5241,29 @@ other: T1 __ret_0: T0 - self: T5 - other: T1 + other: T2 __ret_0: T0 - self: T6 - other: T1 + other: T12 __ret_0: T0 - self: T7 - other: T1 + other: T2 __ret_0: T0 - self: T8 
- other: T1 + other: T2 __ret_0: T0 - self: T9 other: T1 __ret_0: T0 + - self: T10 + other: T1 + __ret_0: T0 + - self: T11 + other: T1 + __ret_0: T0 + - self: T12 + other: T6 + __ret_0: T0 - func: aten::logical_xor namespace: edge @@ -5035,14 +5271,17 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Byte] - T3: [Char] - T4: [Double] - T5: [Float] - T6: [Half] - T7: [Int] - T8: [Long] - T9: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Byte] + T4: [Char] + T5: [Double] + T6: [Double, Float, Half] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] + T12: [UInt16] type_constraint: - self: T0 other: T1 @@ -5050,9 +5289,6 @@ - self: T1 other: T0 __ret_0: T0 - - self: T1 - other: T2 - __ret_0: T0 - self: T1 other: T3 __ret_0: T0 @@ -5060,22 +5296,22 @@ other: T4 __ret_0: T0 - self: T1 - other: T5 + other: T9 __ret_0: T0 - self: T1 - other: T6 + other: T10 __ret_0: T0 - self: T1 - other: T7 + other: T11 __ret_0: T0 - - self: T1 - other: T8 + - self: T2 + other: T5 __ret_0: T0 - - self: T1 - other: T9 + - self: T2 + other: T7 __ret_0: T0 - self: T2 - other: T1 + other: T8 __ret_0: T0 - self: T3 other: T1 @@ -5084,26 +5320,35 @@ other: T1 __ret_0: T0 - self: T5 - other: T1 + other: T2 __ret_0: T0 - self: T6 - other: T1 + other: T12 __ret_0: T0 - self: T7 - other: T1 + other: T2 __ret_0: T0 - self: T8 - other: T1 + other: T2 __ret_0: T0 - self: T9 other: T1 __ret_0: T0 + - self: T10 + other: T1 + __ret_0: T0 + - self: T11 + other: T1 + __ret_0: T0 + - self: T12 + other: T6 + __ret_0: T0 - func: aten::logit namespace: edge inherits: aten::logit type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -5118,51 +5363,52 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Float, Int] - T3: [Byte] - T4: [Char] - T5: 
[Double] - T6: [Float] - T7: [Half] - T8: [Int] - T9: [Long] - T10: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Float, Int] + T4: [Byte] + T5: [Char] + T6: [Double] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] type_constraint: - self: T0 - other: T2 + other: T3 __ret_0: T0 - self: T1 other: T0 __ret_0: T0 - self: T1 - other: T6 - __ret_0: T0 - - self: T1 - other: T8 + other: T9 __ret_0: T0 - - self: T3 - other: T2 + - self: T2 + other: T7 __ret_0: T0 - self: T4 - other: T2 + other: T3 __ret_0: T0 - self: T5 - other: T2 + other: T3 __ret_0: T0 - self: T6 - other: T2 + other: T3 __ret_0: T0 - self: T7 - other: T2 + other: T3 __ret_0: T0 - self: T8 - other: T2 + other: T3 __ret_0: T0 - self: T9 - other: T2 + other: T3 __ret_0: T0 - self: T10 - other: T2 + other: T3 + __ret_0: T0 + - self: T11 + other: T3 __ret_0: T0 - func: aten::lt.Tensor @@ -5171,14 +5417,17 @@ type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Byte] - T3: [Char] - T4: [Double] - T5: [Float] - T6: [Half] - T7: [Int] - T8: [Long] - T9: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Byte] + T4: [Char] + T5: [Double] + T6: [Double, Float, Half] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] + T12: [UInt16] type_constraint: - self: T0 other: T1 @@ -5186,9 +5435,6 @@ - self: T1 other: T0 __ret_0: T0 - - self: T1 - other: T2 - __ret_0: T0 - self: T1 other: T3 __ret_0: T0 @@ -5196,22 +5442,22 @@ other: T4 __ret_0: T0 - self: T1 - other: T5 + other: T9 __ret_0: T0 - self: T1 - other: T6 + other: T10 __ret_0: T0 - self: T1 - other: T7 + other: T11 __ret_0: T0 - - self: T1 - other: T8 + - self: T2 + other: T5 __ret_0: T0 - - self: T1 - other: T9 + - self: T2 + other: T7 __ret_0: T0 - self: T2 - other: T1 + other: T8 __ret_0: T0 - self: T3 other: T1 @@ -5220,20 +5466,29 @@ other: T1 __ret_0: T0 - self: T5 - other: T1 + other: T2 __ret_0: 
T0 - self: T6 - other: T1 + other: T12 __ret_0: T0 - self: T7 - other: T1 + other: T2 __ret_0: T0 - self: T8 - other: T1 + other: T2 __ret_0: T0 - self: T9 other: T1 __ret_0: T0 + - self: T10 + other: T1 + __ret_0: T0 + - self: T11 + other: T1 + __ret_0: T0 + - self: T12 + other: T6 + __ret_0: T0 - func: aten::masked_fill.Scalar namespace: edge @@ -5371,7 +5626,7 @@ namespace: edge inherits: aten::mean.dim type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T1: [Double] T2: [Float] T3: [Half] @@ -5434,9 +5689,9 @@ type_alias: T0: [Bool] T1: [Bool, Byte] - T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T3: [Bool, Byte, Char, Float, Half, Int, Long, Short] - T4: [Bool, Byte, Char, Half, Int, Long, Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] + T4: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] T5: [Bool, Byte, Char, Int, Long, Short] T6: [Bool, Byte, Char, Int, Short] T7: [Bool, Byte, Char, Short] @@ -5531,7 +5786,7 @@ inherits: aten::mul.Scalar type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Float, Int, Long, Short] + T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T2: [Bool, Float, Int] T3: [Bool, Int] T4: [Bool, Long] @@ -5543,6 +5798,7 @@ T10: [Int] T11: [Long] T12: [Short] + T13: [UInt16] type_constraint: - self: T0 other: T0 @@ -5577,20 +5833,23 @@ - self: T12 other: T3 __ret_0: T12 + - self: T13 + other: T3 + __ret_0: T13 - func: aten::mul.Tensor namespace: edge inherits: aten::mul.Tensor type_alias: - T0: [Bool] - T1: [Bool, Byte] - T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T3: [Bool, Byte, Char, Float, Half, Int, Long, Short] - T4: [Bool, Byte, Char, Half, Int, Long, Short] - T5: [Bool, Byte, Char, Int, Long, Short] - T6: [Bool, Byte, Char, Int, Short] - T7: [Bool, Byte, Char, Short] - T8: [Bool, Char] + T0: [Bool, 
Byte] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] + T4: [Bool, Byte, Char, Int, Long, Short] + T5: [Bool, Byte, Char, Int, Short] + T6: [Bool, Byte, Char, Short] + T7: [Bool, Char] + T8: [Bool, UInt16] T9: [Byte] T10: [Byte, Short] T11: [Char] @@ -5603,34 +5862,34 @@ T18: [Short] type_constraint: - self: T0 - other: T0 - __ret_0: T0 - - self: T1 other: T9 __ret_0: T9 - - self: T2 + - self: T1 other: T13 __ret_0: T13 - - self: T3 + - self: T2 other: T14 __ret_0: T14 - - self: T4 + - self: T3 other: T15 __ret_0: T15 - - self: T5 + - self: T4 other: T17 __ret_0: T17 - - self: T6 + - self: T5 other: T16 __ret_0: T16 - - self: T7 + - self: T6 other: T18 __ret_0: T18 - - self: T8 + - self: T7 other: T11 __ret_0: T11 + - self: T8 + other: T8 + __ret_0: T8 - self: T9 - other: T1 + other: T0 __ret_0: T9 - self: T9 other: T12 @@ -5639,7 +5898,7 @@ other: T11 __ret_0: T18 - self: T11 - other: T8 + other: T7 __ret_0: T11 - self: T11 other: T10 @@ -5648,22 +5907,22 @@ other: T9 __ret_0: T18 - self: T13 - other: T2 + other: T1 __ret_0: T13 - self: T14 - other: T3 + other: T2 __ret_0: T14 - self: T15 - other: T4 + other: T3 __ret_0: T15 - self: T16 - other: T6 + other: T5 __ret_0: T16 - self: T17 - other: T5 + other: T4 __ret_0: T17 - self: T18 - other: T7 + other: T6 __ret_0: T18 - func: aten::native_layer_norm @@ -5684,7 +5943,7 @@ inherits: aten::ne.Scalar type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Bool, Float, Int] T3: [Byte] T4: [Char] @@ -5694,6 +5953,7 @@ T8: [Int] T9: [Long] T10: [Short] + T11: [UInt16] type_constraint: - self: T0 other: T2 @@ -5731,6 +5991,9 @@ - self: T10 other: T2 __ret_0: T0 + - self: T11 + other: T2 + __ret_0: T0 - func: aten::ne.Tensor namespace: edge @@ -5738,14 +6001,17 @@ 
type_alias: T0: [Bool] T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Byte] - T3: [Char] - T4: [Double] - T5: [Float] - T6: [Half] - T7: [Int] - T8: [Long] - T9: [Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Byte] + T4: [Char] + T5: [Double] + T6: [Double, Float, Half, UInt16] + T7: [Float] + T8: [Half] + T9: [Int] + T10: [Long] + T11: [Short] + T12: [UInt16] type_constraint: - self: T0 other: T1 @@ -5753,9 +6019,6 @@ - self: T1 other: T0 __ret_0: T0 - - self: T1 - other: T2 - __ret_0: T0 - self: T1 other: T3 __ret_0: T0 @@ -5763,22 +6026,22 @@ other: T4 __ret_0: T0 - self: T1 - other: T5 + other: T9 __ret_0: T0 - self: T1 - other: T6 + other: T10 __ret_0: T0 - self: T1 - other: T7 + other: T11 __ret_0: T0 - - self: T1 - other: T8 + - self: T2 + other: T5 __ret_0: T0 - - self: T1 - other: T9 + - self: T2 + other: T7 __ret_0: T0 - self: T2 - other: T1 + other: T8 __ret_0: T0 - self: T3 other: T1 @@ -5787,20 +6050,29 @@ other: T1 __ret_0: T0 - self: T5 - other: T1 + other: T2 __ret_0: T0 - self: T6 - other: T1 + other: T12 __ret_0: T0 - self: T7 - other: T1 + other: T2 __ret_0: T0 - self: T8 - other: T1 + other: T2 __ret_0: T0 - self: T9 other: T1 __ret_0: T0 + - self: T10 + other: T1 + __ret_0: T0 + - self: T11 + other: T1 + __ret_0: T0 + - self: T12 + other: T6 + __ret_0: T0 - func: aten::neg namespace: edge @@ -5825,7 +6097,7 @@ namespace: edge inherits: aten::ones type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - dtype: T0 __ret_0: T0 @@ -5834,7 +6106,7 @@ namespace: edge inherits: aten::permute_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -5853,7 +6125,7 @@ inherits: aten::pow.Tensor_Scalar type_alias: T0: [Bool] - T1: [Bool, Byte, Char, 
Float, Int, Long, Short] + T1: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T2: [Bool, Float, Int] T3: [Bool, Int] T4: [Bool, Long] @@ -5865,6 +6137,7 @@ T10: [Int] T11: [Long] T12: [Short] + T13: [UInt16] type_constraint: - self: T0 exponent: T0 @@ -5899,15 +6172,18 @@ - self: T12 exponent: T3 __ret_0: T12 + - self: T13 + exponent: T0 + __ret_0: T13 - func: aten::pow.Tensor_Tensor namespace: edge inherits: aten::pow.Tensor_Tensor type_alias: T0: [Bool, Byte] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Byte, Char, Float, Half, Int, Long, Short] - T3: [Bool, Byte, Char, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] T4: [Bool, Byte, Char, Int, Long, Short] T5: [Bool, Byte, Char, Int, Short] T6: [Bool, Byte, Char, Short] @@ -5988,7 +6264,7 @@ namespace: edge inherits: aten::reciprocal type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -6010,7 +6286,7 @@ namespace: edge inherits: aten::remainder.Scalar type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Bool, Float, Int] T2: [Bool, Int] T3: [Bool, Long] @@ -6059,9 +6335,9 @@ inherits: aten::remainder.Tensor type_alias: T0: [Bool, Byte] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Bool, Byte, Char, Float, Half, Int, Long, Short] - T3: [Bool, Byte, Char, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T2: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] T4: [Bool, Byte, Char, Int, Long, Short] T5: [Bool, Byte, Char, Int, Short] T6: [Bool, Byte, Char, Short] @@ -6142,7 +6418,7 @@ 
namespace: edge inherits: aten::repeat type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -6151,7 +6427,7 @@ namespace: edge inherits: aten::round type_alias: - T0: [Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -6160,7 +6436,7 @@ namespace: edge inherits: aten::rsqrt type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -6174,7 +6450,7 @@ inherits: aten::rsub.Scalar type_alias: T0: [Byte] - T1: [Byte, Char, Float, Int, Long, Short] + T1: [Byte, Char, Float, Int, Long, Short, UInt16] T2: [Char] T3: [Double] T4: [Float] @@ -6183,6 +6459,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T0 other: T4 @@ -6280,6 +6557,10 @@ other: T7 alpha: T7 __ret_0: T9 + - self: T10 + other: T4 + alpha: T5 + __ret_0: T4 - func: aten::scalar_tensor namespace: edge @@ -6295,6 +6576,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - s: T1 dtype: T0 @@ -6323,6 +6605,9 @@ - s: T1 dtype: T9 __ret_0: T9 + - s: T1 + dtype: T10 + __ret_0: T10 - func: aten::scatter_add namespace: edge @@ -6379,7 +6664,7 @@ namespace: edge inherits: aten::select_copy.int type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -6389,7 +6674,7 @@ inherits: aten::select_scatter type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Byte] T3: [Char] T4: [Double] @@ -6398,6 +6683,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T0 
src: T1 @@ -6426,12 +6712,15 @@ - self: T9 src: T1 __ret_0: T9 + - self: T10 + src: T1 + __ret_0: T10 - func: aten::sigmoid namespace: edge inherits: aten::sigmoid type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -6453,7 +6742,7 @@ namespace: edge inherits: aten::sin type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -6466,7 +6755,7 @@ namespace: edge inherits: aten::sinh type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -6479,7 +6768,7 @@ namespace: edge inherits: aten::slice_copy.Tensor type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -6489,7 +6778,7 @@ inherits: aten::slice_scatter type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Byte] T3: [Char] T4: [Double] @@ -6498,6 +6787,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T0 src: T1 @@ -6526,12 +6816,15 @@ - self: T9 src: T1 __ret_0: T9 + - self: T10 + src: T1 + __ret_0: T10 - func: aten::split_copy.Tensor namespace: edge inherits: aten::split_copy.Tensor type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -6540,7 +6833,7 @@ namespace: edge inherits: aten::split_with_sizes_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] 
type_constraint: - self: T0 __ret_0: T0 @@ -6549,7 +6842,7 @@ namespace: edge inherits: aten::sqrt type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -6562,7 +6855,7 @@ namespace: edge inherits: aten::squeeze_copy.dim type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -6571,7 +6864,7 @@ namespace: edge inherits: aten::squeeze_copy.dims type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -6580,7 +6873,7 @@ namespace: edge inherits: aten::stack type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - tensors: T0 __ret_0: T0 @@ -6590,7 +6883,7 @@ inherits: aten::sub.Scalar type_alias: T0: [Byte] - T1: [Byte, Char, Float, Int, Long, Short] + T1: [Byte, Char, Float, Int, Long, Short, UInt16] T2: [Char] T3: [Double] T4: [Float] @@ -6599,6 +6892,7 @@ T7: [Int] T8: [Long] T9: [Short] + T10: [UInt16] type_constraint: - self: T0 other: T4 @@ -6696,15 +6990,19 @@ other: T7 alpha: T7 __ret_0: T9 + - self: T10 + other: T4 + alpha: T5 + __ret_0: T4 - func: aten::sub.Tensor namespace: edge inherits: aten::sub.Tensor type_alias: T0: [Byte] - T1: [Byte, Char, Double, Float, Half, Int, Long, Short] - T2: [Byte, Char, Float, Half, Int, Long, Short] - T3: [Byte, Char, Half, Int, Long, Short] + T1: [Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T2: [Byte, Char, Float, Half, Int, Long, Short, UInt16] + T3: [Byte, Char, Half, Int, Long, Short, UInt16] T4: [Byte, Char, Int, Long, Short] T5: [Byte, Char, Int, Short] T6: [Byte, Char, Short] @@ -6718,6 +7016,7 @@ T14: [Int] 
T15: [Long] T16: [Short] + T17: [UInt16] type_constraint: - self: T0 other: T0 @@ -6843,6 +7142,10 @@ other: T16 alpha: T12 __ret_0: T10 + - self: T10 + other: T17 + alpha: T12 + __ret_0: T10 - self: T11 other: T0 alpha: T12 @@ -6883,6 +7186,10 @@ other: T16 alpha: T12 __ret_0: T11 + - self: T11 + other: T17 + alpha: T12 + __ret_0: T11 - self: T13 other: T0 alpha: T12 @@ -6923,6 +7230,10 @@ other: T16 alpha: T12 __ret_0: T13 + - self: T13 + other: T17 + alpha: T12 + __ret_0: T13 - self: T14 other: T5 alpha: T14 @@ -6971,13 +7282,25 @@ other: T13 alpha: T12 __ret_0: T13 + - self: T17 + other: T10 + alpha: T12 + __ret_0: T10 + - self: T17 + other: T11 + alpha: T12 + __ret_0: T11 + - self: T17 + other: T13 + alpha: T12 + __ret_0: T13 - func: aten::sum.dim_IntList namespace: edge inherits: aten::sum.dim_IntList type_alias: T0: [Bool] - T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T1: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] T2: [Byte] T3: [Char] T4: [Double] @@ -7019,7 +7342,7 @@ namespace: edge inherits: aten::t_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -7028,7 +7351,7 @@ namespace: edge inherits: aten::tan type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -7041,7 +7364,7 @@ namespace: edge inherits: aten::tanh type_alias: - T0: [Bool, Byte, Char, Float, Int, Long, Short] + T0: [Bool, Byte, Char, Float, Int, Long, Short, UInt16] T1: [Double, Half] T2: [Float] type_constraint: @@ -7054,7 +7377,7 @@ namespace: edge inherits: aten::transpose_copy.int type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -7072,7 +7395,7 @@ 
namespace: edge inherits: aten::unbind_copy.int type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -7081,7 +7404,7 @@ namespace: edge inherits: aten::unsqueeze_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -7099,7 +7422,7 @@ namespace: edge inherits: aten::view_copy type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - self: T0 __ret_0: T0 @@ -7110,9 +7433,9 @@ type_alias: T0: [Bool] T1: [Bool, Byte] - T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] - T3: [Bool, Byte, Char, Float, Half, Int, Long, Short] - T4: [Bool, Byte, Char, Half, Int, Long, Short] + T2: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] + T3: [Bool, Byte, Char, Float, Half, Int, Long, Short, UInt16] + T4: [Bool, Byte, Char, Half, Int, Long, Short, UInt16] T5: [Bool, Byte, Char, Int, Long, Short] T6: [Bool, Byte, Char, Int, Short] T7: [Bool, Byte, Char, Short] @@ -7127,6 +7450,7 @@ T16: [Int] T17: [Long] T18: [Short] + T19: [UInt16] type_constraint: - condition: T0 self: T1 @@ -7352,6 +7676,10 @@ self: T13 other: T18 __ret_0: T13 + - condition: T1 + self: T13 + other: T19 + __ret_0: T13 - condition: T1 self: T14 other: T0 @@ -7388,6 +7716,10 @@ self: T14 other: T18 __ret_0: T14 + - condition: T1 + self: T14 + other: T19 + __ret_0: T14 - condition: T1 self: T15 other: T0 @@ -7424,6 +7756,10 @@ self: T15 other: T18 __ret_0: T15 + - condition: T1 + self: T15 + other: T19 + __ret_0: T15 - condition: T1 self: T16 other: T0 @@ -7532,6 +7868,18 @@ self: T18 other: T18 __ret_0: T18 + - condition: T1 + self: T19 + other: T13 + __ret_0: T13 + - condition: T1 + self: T19 + other: 
T14 + __ret_0: T14 + - condition: T1 + self: T19 + other: T15 + __ret_0: T15 - condition: T9 self: T1 other: T9 @@ -7617,7 +7965,7 @@ namespace: edge inherits: aten::zeros type_alias: - T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short] + T0: [Bool, Byte, Char, Double, Float, Half, Int, Long, Short, UInt16] type_constraint: - dtype: T0 __ret_0: T0 diff --git a/exir/dialects/edge/test/test_edge_yaml.py b/exir/dialects/edge/test/test_edge_yaml.py index 1571cf5b01..1ff9143420 100644 --- a/exir/dialects/edge/test/test_edge_yaml.py +++ b/exir/dialects/edge/test/test_edge_yaml.py @@ -187,6 +187,7 @@ def test_tensor_list_supported(self) -> None: "Int", "Long", "Short", + "UInt16", ) ], ) From a4def9f821880781069add5f1da09b6259f6b4bf Mon Sep 17 00:00:00 2001 From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com> Date: Thu, 16 Jan 2025 09:48:23 +0100 Subject: [PATCH 16/40] Make ArmPassManager aware of TosaSpecification (#7668) - Pass TosaSpecifcation to ArmPassManager. Based on this the PassManager can decide which passes should be run. - Also adds docstrings and renames some passes. 
Signed-off-by: Oscar Andersson --- backends/arm/_passes/arm_pass_manager.py | 115 ++++++++++-------- backends/arm/_passes/cast_int64_pass.py | 6 +- .../fold_qdq_with_annotated_qparams_pass.py | 13 +- .../_passes/meandim_to_averagepool_pass.py | 4 +- backends/arm/_passes/remove_clone_pass.py | 3 +- backends/arm/arm_backend.py | 23 ++-- backends/arm/quantizer/arm_quantizer.py | 11 +- backends/arm/test/common.py | 11 +- backends/arm/test/ops/test_avg_pool.py | 12 +- backends/arm/test/ops/test_clone.py | 13 +- backends/arm/test/ops/test_expand.py | 16 +-- backends/arm/test/ops/test_hardtanh.py | 18 +-- backends/arm/test/ops/test_max_pool.py | 16 ++- backends/arm/test/ops/test_permute.py | 15 ++- backends/arm/test/ops/test_relu.py | 16 +-- backends/arm/test/ops/test_repeat.py | 14 +-- backends/arm/test/ops/test_var.py | 16 +-- .../arm/test/passes/test_fold_qdq_pass.py | 16 +-- .../passes/test_meandim_to_averagepool2d.py | 8 +- backends/arm/test/tester/arm_tester.py | 6 +- examples/arm/aot_arm_compiler.py | 54 +++++--- 21 files changed, 225 insertions(+), 181 deletions(-) diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 0b4e27e5aa..14972601b6 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -7,7 +7,6 @@ # pyre-unsafe -import torch from executorch.backends.arm._passes.annotate_channels_last_dim_order_pass import ( AnnotateChannelsLastDimOrder, ) @@ -47,7 +46,7 @@ ) from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass from executorch.backends.arm._passes.meandim_to_averagepool_pass import ( - ConvertMeanDimToAveragePool, + ConvertMeanDimToAveragePoolPass, ) from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass @@ -61,86 +60,98 @@ from executorch.backends.arm._passes.unsqueeze_scalar_placeholders_pass import ( 
UnsqueezeScalarPlaceholdersPass, ) +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass from executorch.exir import ExportedProgram -from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_manager import PassManager +from torch.fx import GraphModule class ArmPassManager(PassManager): - def _transform(self, graph_module: torch.fx.GraphModule): + def __init__(self, tosa_spec: TosaSpecification) -> None: + self.tosa_spec = tosa_spec + super().__init__() + + def _transform(self, graph_module: GraphModule): return self(graph_module).graph_module - def transform_to_backend_pipeline(self, exported_program: ExportedProgram): - """Apply passes before transforming program to backend""" + def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(FuseQuantizedActivationPass()) + self.add_pass(RemoveGetItemPass()) + self.add_pass(ConvertSplitToSlicePass()) + self.add_pass(ConvertMmToBmmPass()) self.add_pass(DecomposeLinearPass()) + self.add_pass(ConvertMeanDimToAveragePoolPass()) + + self.add_pass(AnnotateDecomposedMatmulPass()) + self.add_pass(QuantizeFullArgument()) + self.add_pass(FoldAndAnnotateQParamsPass()) + self.add_pass(RetraceFoldedDtypesPass()) + self.add_pass(InsertTableOpsPass(exported_program)) + + self.add_pass(RemoveClonePass()) + self.add_pass(SizeAdjustConv2DPass()) + self.add_pass(ConvertExpandCopyToRepeatPass()) + self.add_pass(UnsqueezeBeforeRepeatPass()) + self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program)) + self.add_pass(CastInt64ToInt32Pass(exported_program)) + self.add_pass(MatchArgRanksPass(exported_program)) + self.add_pass(KeepDimsFalseToSqueezePass()) + self.add_pass(Conv1dUnsqueezePass(exported_program)) + self.add_pass(DecomposeSelectPass()) + + self.add_pass(AnnotateChannelsLastDimOrder()) + + return self._transform(exported_program.graph_module) + + def 
_tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule: + + self.add_pass(FuseQuantizedActivationPass()) self.add_pass(RemoveGetItemPass()) + self.add_pass(ConvertSplitToSlicePass()) + self.add_pass(ConvertMmToBmmPass()) + self.add_pass(DecomposeLinearPass()) self.add_pass(DecomposeLayerNormPass()) self.add_pass(DecomposeVarPass()) - self.add_pass(ConvertMeanDimToAveragePool()) self.add_pass(DecomposeMeanDimPass()) - self.add_pass(ConvertSplitToSlicePass()) - self.add_pass(ConvertMmToBmmPass()) - # TODO MLETORCH-558 + self.add_pass(ConvertMeanDimToAveragePoolPass()) + self.add_pass(DecomposeDivPass()) + self.add_pass(DecomposeSoftmaxesPass()) + self.add_pass(AnnotateDecomposedMatmulPass()) self.add_pass(QuantizeFullArgument()) - self.add_pass( - FoldAndAnnotateQParamsPass( - [ - exir_ops.edge.aten.minimum.default, - exir_ops.edge.aten.maximum.default, - exir_ops.edge.aten.add.Tensor, - exir_ops.edge.aten.avg_pool2d.default, - exir_ops.edge.aten.bmm.default, - exir_ops.edge.aten.cat.default, - exir_ops.edge.aten.convolution.default, - exir_ops.edge.aten.clone.default, - exir_ops.edge.aten.exp.default, - exir_ops.edge.aten.expand_copy.default, - exir_ops.edge.aten.full.default, - exir_ops.edge.aten.hardtanh.default, - exir_ops.edge.aten.log.default, - exir_ops.edge.aten.max_pool2d.default, - exir_ops.edge.aten.mul.Tensor, - exir_ops.edge.aten.permute_copy.default, - exir_ops.edge.aten.reciprocal.default, - exir_ops.edge.aten.relu.default, - exir_ops.edge.aten.repeat.default, - exir_ops.edge.aten.rsqrt.default, - exir_ops.edge.aten.select_copy.int, - exir_ops.edge.aten.sigmoid.default, - exir_ops.edge.aten.slice_copy.Tensor, - exir_ops.edge.aten.squeeze_copy.dims, - exir_ops.edge.aten.sub.Tensor, - exir_ops.edge.aten.sum.dim_IntList, - exir_ops.edge.aten.tanh.default, - exir_ops.edge.aten.unsqueeze_copy.default, - exir_ops.edge.aten.upsample_nearest2d.vec, - exir_ops.edge.aten.view_copy.default, - ] - ) - ) + 
self.add_pass(FoldAndAnnotateQParamsPass()) self.add_pass(RetraceFoldedDtypesPass()) self.add_pass(InsertTableOpsPass(exported_program)) + + self.add_pass(RemoveClonePass()) + self.add_pass(SizeAdjustConv2DPass()) self.add_pass(ConvertExpandCopyToRepeatPass()) self.add_pass(UnsqueezeBeforeRepeatPass()) - self.add_pass(CastInt64ToInt32Pass(exported_program)) self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program)) - self.add_pass(SizeAdjustConv2DPass()) - self.add_pass(RemoveClonePass()) + self.add_pass(CastInt64ToInt32Pass(exported_program)) self.add_pass(MatchArgRanksPass(exported_program)) - self.add_pass(DecomposeDivPass()) self.add_pass(KeepDimsFalseToSqueezePass()) self.add_pass(Conv1dUnsqueezePass(exported_program)) - self.add_pass(DecomposeSoftmaxesPass()) self.add_pass(DecomposeSelectPass()) + self.add_pass(AnnotateChannelsLastDimOrder()) return self._transform(exported_program.graph_module) - def transform_for_annotation_pipeline(self, graph_module: torch.fx.GraphModule): + def transform_to_backend_pipeline(self, exported_program: ExportedProgram): + """Apply passes before transforming program to backend""" + if self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+BI"): + return self._tosa_080_BI_pipeline(exported_program) + elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+MI"): + return self._tosa_080_MI_pipeline(exported_program) + else: + raise NotImplementedError( + f"No pass pipeline implemented for {self.tosa_spec=}" + ) + + def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(ScalarsToAttributePass()) self.add_pass(DecomposeLayerNormPass()) self.add_pass(DecomposeVarPass()) diff --git a/backends/arm/_passes/cast_int64_pass.py b/backends/arm/_passes/cast_int64_pass.py index aab6ed8eb4..dffa4c199a 100644 --- a/backends/arm/_passes/cast_int64_pass.py +++ b/backends/arm/_passes/cast_int64_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. 
+# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -17,6 +17,10 @@ class CastInt64ToInt32Pass(ExportPass): + """ + Cast int64 buffers to int32 if the int64 data is in int32 range. + """ + def __init__(self, exported_program: torch.export.ExportedProgram): super(CastInt64ToInt32Pass, self).__init__() self.exported_program = exported_program diff --git a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py index 045506f19d..5a6b06d100 100644 --- a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py +++ b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -6,7 +6,7 @@ import copy -from typing import cast, Dict, Iterable, Set, Tuple +from typing import cast, Dict, Set, Tuple from executorch.backends.arm.tosa_quant_utils import QuantArgs @@ -55,7 +55,7 @@ def get_output_qparams(node: Node) -> dict[int, QuantArgs]: class FoldAndAnnotateQParamsPass(ExportPass): """ A pass that walks the graph and removes any DQ and Q nodes before and after the target - node in the supplied list of operators. + node. The quantization parameters from the DQ/Q nodes are stored as meta values to be accessible for later lowering and serialization passes. 
The assumption is that the quantization annotatation adds DQ nodes for all tensor @@ -82,9 +82,8 @@ class FoldAndAnnotateQParamsPass(ExportPass): """ - def __init__(self, targeted_ops: Iterable[EdgeOpOverload]) -> None: + def __init__(self) -> None: super().__init__() - self.targeted_ops = targeted_ops def fold_and_annotate_arg( self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int @@ -131,7 +130,7 @@ def call(self, graph_module: GraphModule) -> PassResult: # Loop over the graph nodes and find any node in the 'targeted_ops' list. for n in graph_module.graph.nodes: n = cast(Node, n) - if n.op != "call_function" or n.target not in self.targeted_ops: + if n.op != "call_function": continue # Make sure we haven't already set qparams meta information on the node @@ -180,7 +179,7 @@ class QuantizeFullArgument(ExportPass): def call(self, graph_module: GraphModule) -> PassResult: modified = False - # Loop over the graph nodes and find any node in the 'targeted_ops' list. + # Loop over the graph nodes and find full.default nodes. for n in graph_module.graph.nodes: n = cast(Node, n) if n.target != exir_ops.edge.aten.full.default: diff --git a/backends/arm/_passes/meandim_to_averagepool_pass.py b/backends/arm/_passes/meandim_to_averagepool_pass.py index 0974eac740..9a75519150 100644 --- a/backends/arm/_passes/meandim_to_averagepool_pass.py +++ b/backends/arm/_passes/meandim_to_averagepool_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -16,7 +16,7 @@ Argument = Any -class ConvertMeanDimToAveragePool(ExportPass): +class ConvertMeanDimToAveragePoolPass(ExportPass): """ Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation. 
""" diff --git a/backends/arm/_passes/remove_clone_pass.py b/backends/arm/_passes/remove_clone_pass.py index ac992ce2a0..9542a4097a 100644 --- a/backends/arm/_passes/remove_clone_pass.py +++ b/backends/arm/_passes/remove_clone_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -11,6 +11,7 @@ class RemoveClonePass(ExportPass): + """Remove all clones from graph_module""" def call_operator(self, op, args, kwargs, meta): if op != exir_ops.edge.aten.clone.default: diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 601cac3692..7bdbdf3947 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -50,7 +50,7 @@ def __init__(self): self.output_format = None self.path_for_intermediates = None self.quantize_io = False - self.tosa_version = None + self.tosa_spec = None self.input_order = None def ethosu_compile_spec( @@ -92,11 +92,13 @@ def ethosu_compile_spec( if "u55" in config: # Add the Ethos-U55 extension marker base_tosa_version += "+u55" - self.tosa_version = TosaSpecification.create_from_string(base_tosa_version) + self.tosa_spec = TosaSpecification.create_from_string(base_tosa_version) return self - def tosa_compile_spec(self, tosa_version: str) -> "ArmCompileSpecBuilder": + def tosa_compile_spec( + self, tosa_spec: str | TosaSpecification + ) -> "ArmCompileSpecBuilder": """ Generate compile spec for TOSA flatbuffer output """ @@ -104,7 +106,12 @@ def tosa_compile_spec(self, tosa_version: str) -> "ArmCompileSpecBuilder": self.output_format is None ), f"Output format already set: {self.output_format}" self.output_format = "tosa" - self.tosa_version = TosaSpecification.create_from_string(tosa_version) + if isinstance(tosa_spec, TosaSpecification): + self.tosa_spec = tosa_spec + elif isinstance(tosa_spec, str): + self.tosa_spec = 
TosaSpecification.create_from_string(tosa_spec) + else: + raise RuntimeError(f"Invalid type for {tosa_spec}!") return self def dump_intermediate_artifacts_to( @@ -138,12 +145,10 @@ def build(self) -> List[CompileSpec]: """ Generate a list of compile spec objects from the builder """ - assert self.tosa_version + assert self.tosa_spec # Always supply a TOSA version - self.compile_spec = [ - CompileSpec("tosa_version", str(self.tosa_version).encode()) - ] + self.compile_spec = [CompileSpec("tosa_version", str(self.tosa_spec).encode())] if self.output_format == "vela": self.compile_spec += [ @@ -253,7 +258,7 @@ def preprocess( # noqa: C901 # Converted output for this subgraph, serializer needs path early as it emits # const data directly. Path created and data written only in debug builds. tosa_graph = ts.TosaSerializer(artifact_path) - graph_module = ArmPassManager().transform_to_backend_pipeline( + graph_module = ArmPassManager(tosa_spec).transform_to_backend_pipeline( exported_program=edge_program ) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index fe104db972..cba66cfe56 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -24,6 +24,7 @@ from executorch.backends.arm.quantizer.quantization_annotator import annotate_graph from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig +from executorch.backends.arm.tosa_specification import TosaSpecification from torch.ao.quantization.fake_quantize import ( FakeQuantize, FusedMovingAvgObsFakeQuantize, @@ -205,8 +206,10 @@ def not_module_type_or_name_filter(n: Node) -> bool: class ArmQuantizer(Quantizer): - def __init__(self) -> None: + + def __init__(self, tosa_spec: TosaSpecification) -> None: super().__init__() + self.tosa_spec = tosa_spec self.global_config: Optional[QuantizationConfig] = None self.io_config: Optional[QuantizationConfig] = None self.module_type_config: Dict[Callable, Optional[QuantizationConfig]] = {} @@ -250,7 +253,9 @@ def transform_for_annotation(self, model: GraphModule) -> GraphModule: Currently transforms scalar values to tensor attributes. """ - return ArmPassManager().transform_for_annotation_pipeline(graph_module=model) + return ArmPassManager(self.tosa_spec).transform_for_annotation_pipeline( + graph_module=model + ) def annotate(self, model: GraphModule) -> GraphModule: """Performs the quantization annotation on the graph. 
diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index bcd68cb173..c0f81bbe2e 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -12,6 +12,7 @@ from pathlib import Path from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.exir.backend.compile_spec_schema import CompileSpec @@ -53,15 +54,17 @@ def maybe_get_tosa_collate_path() -> str | None: return None -def get_tosa_compile_spec(tosa_version: str, custom_path=None) -> list[CompileSpec]: +def get_tosa_compile_spec( + tosa_spec: str | TosaSpecification, custom_path=None +) -> list[CompileSpec]: """ Default compile spec for TOSA tests. """ - return get_tosa_compile_spec_unbuilt(tosa_version, custom_path).build() + return get_tosa_compile_spec_unbuilt(tosa_spec, custom_path).build() def get_tosa_compile_spec_unbuilt( - tosa_version: str, custom_path=None + tosa_spec: str | TosaSpecification, custom_path=None ) -> ArmCompileSpecBuilder: """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify the compile spec before calling .build() to finalize it. 
@@ -73,7 +76,7 @@ def get_tosa_compile_spec_unbuilt( os.makedirs(custom_path, exist_ok=True) compile_spec_builder = ( ArmCompileSpecBuilder() - .tosa_compile_spec(tosa_version) + .tosa_compile_spec(tosa_spec) .dump_intermediate_artifacts_to(custom_path) .set_quantize_io(True) ) diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index bc37fbb136..16396950dc 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -18,6 +18,7 @@ ) from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -73,14 +74,14 @@ def _test_avgpool2d_tosa_MI_pipeline( def _test_avgpool2d_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") + compile_spec = common.get_tosa_compile_spec(tosa_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", - ), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -100,7 +101,8 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.tensor], ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) tester = ( ArmTester( module, diff --git a/backends/arm/test/ops/test_clone.py 
b/backends/arm/test/ops/test_clone.py index 300ebb6f37..1d46173a68 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -19,6 +19,7 @@ ) from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester.tester import Quantize @@ -60,13 +61,11 @@ def _test_clone_tosa_MI_pipeline( def _test_clone_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") + compile_spec = common.get_tosa_compile_spec(tosa_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.clone.default": 1}) diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index 915b1fe7e0..116f5d64e8 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -22,6 +22,7 @@ ) from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.backend_details import CompileSpec @@ -64,13 +65,11 @@ def _test_expand_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tupl ) def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") + compile_spec = common.get_tosa_compile_spec(tosa_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.expand.default": 1}) @@ -85,7 +84,8 @@ def _test_expand_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl def _test_expand_ethosu_BI_pipeline( self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) tester = ( ArmTester( module, diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py index 7125920c8c..cf0a49827a 100644 --- a/backends/arm/test/ops/test_hardtanh.py +++ b/backends/arm/test/ops/test_hardtanh.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. 
-# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -17,9 +17,10 @@ ArmQuantizer, get_symmetric_quantization_config, ) - from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester.tester import Quantize from parameterized import parameterized @@ -71,13 +72,11 @@ def _test_hardtanh_tosa_MI_pipeline( def _test_hardtanh_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") + compile_spec = common.get_tosa_compile_spec(tosa_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.hardtanh.default": 1}) @@ -93,7 +92,8 @@ def _test_hardtanh_tosa_BI_pipeline( def _test_hardtanh_tosa_ethosu_BI_pipeline( self, compile_spec, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) tester = ( ArmTester( module, diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index 81f27beab4..e3502baf2c 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ 
b/backends/arm/test/ops/test_max_pool.py @@ -19,6 +19,7 @@ ) from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.backend_details import CompileSpec @@ -86,15 +87,11 @@ def _test_maxpool2d_tosa_MI_pipeline( def _test_maxpool2d_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") + compile_spec = common.get_tosa_compile_spec(tosa_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", - ), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.max_pool2d.default": 1}) @@ -118,7 +115,8 @@ def _test_maxpool2d_tosa_ethos_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.tensor], ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) tester = ( ArmTester( module, diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index ec7ecaa81b..f0bfe23cff 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -17,9 +17,9 @@ ArmQuantizer, get_symmetric_quantization_config, ) - from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester +from 
executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -74,13 +74,11 @@ def _test_permute_tosa_MI_pipeline( def _test_permute_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") + compile_spec = common.get_tosa_compile_spec(tosa_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.permute.default": 1}) @@ -99,7 +97,8 @@ def _test_permute_ethos_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) tester = ( ArmTester( module, diff --git a/backends/arm/test/ops/test_relu.py b/backends/arm/test/ops/test_relu.py index 5a7bd4f5ec..dd2bc4817e 100644 --- a/backends/arm/test/ops/test_relu.py +++ b/backends/arm/test/ops/test_relu.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -16,6 +16,7 @@ ) from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -64,13 +65,11 @@ def _test_relu_tosa_MI_pipeline( def _test_relu_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") + compile_spec = common.get_tosa_compile_spec(tosa_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.relu.default": 1}) @@ -89,7 +88,8 @@ def _test_relu_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.tensor], ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( ArmTester( module, diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index bad872792b..d35f699b72 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -19,6 +19,7 @@ ) from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.tosa_specification import TosaSpecification from 
executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.backend_details import CompileSpec @@ -61,13 +62,11 @@ def _test_repeat_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tupl ) def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tuple): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") + compile_spec = common.get_tosa_compile_spec(tosa_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .check_count({"torch.ops.aten.repeat.default": 1}) @@ -82,7 +81,8 @@ def _test_repeat_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: Tupl def _test_repeat_ethosu_pipeline( self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( ArmTester( module, diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index e1fed05817..fd45c2d83f 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -15,9 +15,10 @@ ArmQuantizer, get_symmetric_quantization_config, ) - from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester + +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.backend_details import CompileSpec @@ -112,13 +113,11 @@ def 
_test_var_tosa_BI_pipeline( test_data: torch.Tensor, target_str: str = None, ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80+BI") + compile_spec = common.get_tosa_compile_spec(tosa_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( - ArmTester( - module, - example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), - ) + ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() .to_edge() @@ -135,7 +134,8 @@ def _test_var_ethosu_BI_pipeline( test_data: torch.Tensor, target_str: str = None, ): - quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) + tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec) + quantizer = ArmQuantizer(tosa_spec).set_io(get_symmetric_quantization_config()) ( ArmTester( module, diff --git a/backends/arm/test/passes/test_fold_qdq_pass.py b/backends/arm/test/passes/test_fold_qdq_pass.py index cd7cf75139..ebb96faf90 100644 --- a/backends/arm/test/passes/test_fold_qdq_pass.py +++ b/backends/arm/test/passes/test_fold_qdq_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -16,8 +16,6 @@ from executorch.backends.xnnpack.test.tester.tester import RunPasses -from executorch.exir.dialects._ops import ops as exir_ops - class SimpleQuantizeModel(torch.nn.Module): def forward(self, x, y): @@ -27,16 +25,6 @@ def get_inputs(self): return (torch.rand(1, 1280, 7, 7), torch.rand(1, 1280, 7, 7)) -class FoldAndAnnotateQParamsPassTestClass(FoldAndAnnotateQParamsPass): - def __init__(self): - super(FoldAndAnnotateQParamsPassTestClass, self).__init__( - [ - exir_ops.edge.aten.add.Tensor, - exir_ops.edge.aten.maximum.default, - ] - ) - - class TestFoldAndAnnotateQParamsPass(unittest.TestCase): """ Tests the FoldAndAnnotateQParamsPass which folds dq/q nodes into @@ -49,7 +37,7 @@ def test_fold_qdq_pass(self): is removed from the representation. """ module = SimpleQuantizeModel() - test_pass_stage = RunPasses([FoldAndAnnotateQParamsPassTestClass]) + test_pass_stage = RunPasses([FoldAndAnnotateQParamsPass]) ( ArmTester( module, diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py index 93badc6435..e07e91ed72 100644 --- a/backends/arm/test/passes/test_meandim_to_averagepool2d.py +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -8,7 +8,7 @@ import torch from executorch.backends.arm._passes.meandim_to_averagepool_pass import ( - ConvertMeanDimToAveragePool, + ConvertMeanDimToAveragePoolPass, ) from executorch.backends.arm.test import common @@ -41,7 +41,7 @@ class TestMeandimToAveragePool2dPass(unittest.TestCase): def test_tosa_BI_meandim_to_averagepool(self): module = MeanDim() - test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + test_pass_stage = RunPasses([ConvertMeanDimToAveragePoolPass]) ( ArmTester( module, @@ -58,7 +58,7 @@ def test_tosa_BI_meandim_to_averagepool(self): def test_tosa_BI_meandim_no_modification(self): module = MeanDim2() - test_pass_stage = RunPasses([ConvertMeanDimToAveragePool]) + test_pass_stage = RunPasses([ConvertMeanDimToAveragePoolPass]) ( ArmTester( module, diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index abb192e308..e5c700ec3c 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -33,6 +33,7 @@ print_error_diffs, ) from executorch.backends.arm.tosa_mapping import extract_tensor_meta +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.xnnpack.test.tester import Tester from executorch.devtools.backend_debug import get_delegation_info @@ -184,8 +185,11 @@ def __init__( def quantize(self, quantize_stage: Optional[tester.Quantize] = None): if quantize_stage is None: + tosa_spec: TosaSpecification = TosaSpecification.create_from_compilespecs( + compile_specs=self.compile_spec + ) quantize_stage = tester.Quantize( - ArmQuantizer(), + ArmQuantizer(tosa_spec), get_symmetric_quantization_config(is_per_channel=False), ) return super().quantize(quantize_stage) diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 1208d79b06..bf7bbd87ef 100644 --- a/examples/arm/aot_arm_compiler.py +++ 
b/examples/arm/aot_arm_compiler.py @@ -16,12 +16,13 @@ from typing import Any, Dict, Optional, Tuple import torch -from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder +from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder, CompileSpec from executorch.backends.arm.arm_partitioner import ArmPartitioner from executorch.backends.arm.quantizer.arm_quantizer import ( ArmQuantizer, get_symmetric_quantization_config, ) +from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.arm.util.arm_model_evaluator import ( GenericModelEvaluator, @@ -88,6 +89,7 @@ def get_model_and_inputs_from_name(model_name: str) -> Tuple[torch.nn.Module, An def quantize( model: torch.nn.Module, model_name: str, + tosa_spec: TosaSpecification, example_inputs: Tuple[torch.Tensor], evaluator_name: str | None, evaluator_config: Dict[str, Any] | None, @@ -95,7 +97,7 @@ def quantize( """This is the official recommended flow for quantization in pytorch 2.0 export""" logging.info("Quantizing Model...") logging.debug(f"Original model: {model}") - quantizer = ArmQuantizer() + quantizer = ArmQuantizer(tosa_spec) # if we set is_per_channel to True, we also need to add out_variant of quantize_per_channel/dequantize_per_channel operator_config = get_symmetric_quantization_config(is_per_channel=False) @@ -260,7 +262,7 @@ def get_compile_spec( reorder_inputs: Optional[str] = None, system_config: Optional[str] = None, memory_mode: Optional[str] = None, -) -> ArmCompileSpecBuilder: +) -> list[CompileSpec]: spec_builder = None if target == "TOSA": spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI") @@ -513,17 +515,6 @@ def get_args(): # Quantize if required model_int8 = None - if args.quantize: - model = quantize( - model, args.model_name, example_inputs, args.evaluate, args.evaluate_config - ) - model_int8 = model - # Wrap quantized model back into an exported_program - exported_program = 
torch.export.export_for_training(model, example_inputs) - - if args.intermediates: - os.makedirs(args.intermediates, exist_ok=True) - if args.delegate: # As we can target multiple output encodings from ArmBackend, one must # be specified. @@ -534,6 +525,23 @@ def get_args(): args.system_config, args.memory_mode, ) + if args.quantize: + tosa_spec = TosaSpecification.create_from_compilespecs(compile_spec) + model = quantize( + model, + args.model_name, + tosa_spec, + example_inputs, + args.evaluate, + args.evaluate_config, + ) + model_int8 = model + # Wrap quantized model back into an exported_program + exported_program = torch.export.export_for_training(model, example_inputs) + + if args.intermediates: + os.makedirs(args.intermediates, exist_ok=True) + edge = to_edge_transform_and_lower( exported_program, partitioner=[ArmPartitioner(compile_spec)], @@ -542,7 +550,25 @@ def get_args(): _skip_dim_order=True, ), ) + else: + if args.quantize: + tosa_spec = TosaSpecification.create_from_string("TOSA-0.80.0+BI") + model = quantize( + model, + args.model_name, + tosa_spec, + example_inputs, + args.evaluate, + args.evaluate_config, + ) + model_int8 = model + # Wrap quantized model back into an exported_program + exported_program = torch.export.export_for_training(model, example_inputs) + + if args.intermediates: + os.makedirs(args.intermediates, exist_ok=True) + edge = to_edge_transform_and_lower( exported_program, compile_config=EdgeCompileConfig( From 0dba025e9c03d8c081d1a1086e8919e2a71b9a90 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Thu, 16 Jan 2025 10:13:21 +0100 Subject: [PATCH 17/40] Remove incorrectly xfailing split tests (#7648) Test failed due to checking for split op when the test contained a split_with_sizes op. Remove check. 
Signed-off-by: Erik Lundell --- backends/arm/test/ops/test_split.py | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/backends/arm/test/ops/test_split.py b/backends/arm/test/ops/test_split.py index a1ba53c881..b86e27f1a4 100644 --- a/backends/arm/test/ops/test_split.py +++ b/backends/arm/test/ops/test_split.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -101,7 +101,6 @@ def _test_split_ethosu_BI_pipeline( ) .quantize() .export() - .check(["torch.ops.aten.split.Tensor"]) .to_edge() .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) @@ -129,33 +128,14 @@ def test_split_two_out_tosa_MI(self, test_data: test_data_t): def test_split_tosa_BI(self, test_data: test_data_t): self._test_split_tosa_BI_pipeline(self.Split(), test_data) - @parameterized.expand( - [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]] - ) + @parameterized.expand(Split.test_data) def test_split_u55_BI(self, test_data: test_data_t): self._test_split_ethosu_BI_pipeline( common.get_u55_compile_spec(), self.Split(), test_data ) - # TODO MLETORCH-350 - @parameterized.expand([Split.test_data[3], Split.test_data[5]]) - @unittest.expectedFailure - def test_split_u55_BI_skip(self, test_data: test_data_t): - self._test_split_ethosu_BI_pipeline( - common.get_u55_compile_spec(), self.Split(), test_data - ) - - @parameterized.expand( - [Split.test_data[0], Split.test_data[1], Split.test_data[2], Split.test_data[4]] - ) + @parameterized.expand(Split.test_data) def test_split_u85_BI(self, test_data: test_data_t): self._test_split_ethosu_BI_pipeline( common.get_u85_compile_spec(), self.Split(), test_data ) - - @parameterized.expand([Split.test_data[3], Split.test_data[5]]) - @unittest.expectedFailure - def 
test_split_u85_BI_skip(self, test_data: test_data_t): - self._test_split_ethosu_BI_pipeline( - common.get_u85_compile_spec(), self.Split(), test_data - ) From d1b33cbbc0555935d9a0ac52f9ef7f0c1acfeccf Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Thu, 16 Jan 2025 14:40:54 +0100 Subject: [PATCH 18/40] Remove quantize_io from compile_spec (#7647) quantize_io was only used in arm_partitioner and is not needed there anymore when running the delegate in the graph. Signed-off-by: Erik Lundell --- backends/arm/arm_backend.py | 21 +------------- backends/arm/arm_partitioner.py | 4 --- backends/arm/test/common.py | 9 ------ backends/arm/test/ops/test_depthwise_conv.py | 30 +++++++------------- examples/arm/aot_arm_compiler.py | 2 -- 5 files changed, 11 insertions(+), 55 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 7bdbdf3947..b4512f37af 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -49,8 +49,7 @@ def __init__(self): self.compiler_flags = [] self.output_format = None self.path_for_intermediates = None - self.quantize_io = False - self.tosa_spec = None + self.tosa_version = None self.input_order = None def ethosu_compile_spec( @@ -123,14 +122,6 @@ def dump_intermediate_artifacts_to( self.path_for_intermediates = output_path return self - def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": - """ - Quantization of inputs and dequantization of outputs for cases where - whole graph is quantized and method signature is not of quantized type. 
- """ - self.quantize_io = quantize_io - return self - def set_input_order( self, input_order: Optional[str] = None ) -> "ArmCompileSpecBuilder": @@ -170,9 +161,6 @@ def build(self) -> List[CompileSpec]: ) ) - if self.quantize_io: - self.compile_spec.append(CompileSpec("quantize_io", "True".encode())) - return self.compile_spec @@ -183,13 +171,6 @@ def is_tosa(compile_spec: List[CompileSpec]) -> bool: return False -def is_quantize_io(compile_specs: List[CompileSpec]) -> bool: - for spec in compile_specs: - if spec.key == "quantize_io" and spec.value.decode() == "True": - return True - return False - - def get_tosa_version(compile_spec: List[CompileSpec]) -> TosaSpecification: for spec in compile_spec: if spec.key == "tosa_version": diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py index ef4589abf5..cc4058c4c5 100644 --- a/backends/arm/arm_partitioner.py +++ b/backends/arm/arm_partitioner.py @@ -12,7 +12,6 @@ import torch from executorch.backends.arm.arm_backend import ( ArmBackend, - is_quantize_io, ) # usort: skip from executorch.backends.arm.operator_support.tosa_supported_operators import ( TOSASupportedOperators, @@ -89,9 +88,6 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: node.meta["delegation_tag"] = tag partition_tags[tag] = self.delegation_spec - if not is_quantize_io(self.delegation_spec.compile_specs): - continue - # De-tag outmost q-nodes upwards and dq-nodes downwards. # De-tag if at least one input/ output is not part of partition. 
for node in partition.nodes: diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index c0f81bbe2e..eb97d9b1e7 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -78,14 +78,12 @@ def get_tosa_compile_spec_unbuilt( ArmCompileSpecBuilder() .tosa_compile_spec(tosa_spec) .dump_intermediate_artifacts_to(custom_path) - .set_quantize_io(True) ) return compile_spec_builder def get_u55_compile_spec( - quantize_io=True, custom_path=None, reorder_inputs=None, ) -> list[CompileSpec]: @@ -93,14 +91,12 @@ def get_u55_compile_spec( Default compile spec for Ethos-U55 tests. """ return get_u55_compile_spec_unbuilt( - quantize_io=quantize_io, custom_path=custom_path, reorder_inputs=reorder_inputs, ).build() def get_u85_compile_spec( - quantize_io=True, custom_path=None, reorder_inputs=None, ) -> list[CompileSpec]: @@ -108,14 +104,12 @@ def get_u85_compile_spec( Default compile spec for Ethos-U85 tests. """ return get_u85_compile_spec_unbuilt( - quantize_io=quantize_io, custom_path=custom_path, reorder_inputs=reorder_inputs, ).build() def get_u55_compile_spec_unbuilt( - quantize_io=True, custom_path=None, reorder_inputs=None, ) -> ArmCompileSpecBuilder: @@ -133,7 +127,6 @@ def get_u55_compile_spec_unbuilt( memory_mode="Shared_Sram", extra_flags="--debug-force-regor --output-format=raw", ) - .set_quantize_io(quantize_io) .dump_intermediate_artifacts_to(artifact_path) .set_input_order(reorder_inputs) ) @@ -141,7 +134,6 @@ def get_u55_compile_spec_unbuilt( def get_u85_compile_spec_unbuilt( - quantize_io=True, custom_path=None, reorder_inputs=None, ) -> list[CompileSpec]: @@ -157,7 +149,6 @@ def get_u85_compile_spec_unbuilt( memory_mode="Shared_Sram", extra_flags="--output-format=raw", ) - .set_quantize_io(quantize_io) .dump_intermediate_artifacts_to(artifact_path) .set_input_order(reorder_inputs) ) diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 22d9798aea..b8d69c89f1 100644 --- 
a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -259,46 +259,38 @@ def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module): @parameterized.expand(testsuite_conv2d[:4], skip_on_empty=True) @pytest.mark.corstone_fvp - def test_dw_conv2d_u55_BI( - self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = True - ): + def test_dw_conv2d_u55_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u55_compile_spec(quantize_io=set_quantize_io), + common.get_u55_compile_spec(), model.get_inputs(), ) @parameterized.expand(testsuite_conv2d[4:], skip_on_empty=True) @pytest.mark.corstone_fvp @conftest.expectedFailureOnFVP # TODO: MLETORCH-516 - def test_dw_conv2d_u55_BI_xfails( - self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False - ): + def test_dw_conv2d_u55_BI_xfails(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u55_compile_spec(quantize_io=set_quantize_io), + common.get_u55_compile_spec(), model.get_inputs(), ) @parameterized.expand(testsuite_conv1d, skip_on_empty=True) @pytest.mark.corstone_fvp - def test_dw_conv1d_u55_BI( - self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = True - ): + def test_dw_conv1d_u55_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u55_compile_spec(quantize_io=set_quantize_io), + common.get_u55_compile_spec(), model.get_inputs(), ) @parameterized.expand(testsuite_conv1d + testsuite_conv2d_u85) @pytest.mark.corstone_fvp - def test_dw_conv_u85_BI( - self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = True - ): + def test_dw_conv_u85_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u85_compile_spec(quantize_io=set_quantize_io), + common.get_u85_compile_spec(), model.get_inputs(), ) @@ 
-306,11 +298,9 @@ def test_dw_conv_u85_BI( @parameterized.expand(testsuite_conv2d_u85_xfails) @pytest.mark.corstone_fvp @conftest.expectedFailureOnFVP - def test_dw_conv_u85_BI_xfails( - self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = True - ): + def test_dw_conv_u85_BI_xfails(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u85_compile_spec(quantize_io=set_quantize_io), + common.get_u85_compile_spec(), model.get_inputs(), ) diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index bf7bbd87ef..e842cde6bb 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -275,7 +275,6 @@ def get_compile_spec( memory_mode=memory_mode, extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate", ) - .set_quantize_io(True) .set_input_order(reorder_inputs) ) elif "ethos-u85" in target: @@ -287,7 +286,6 @@ def get_compile_spec( memory_mode=memory_mode, extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate", ) - .set_quantize_io(True) .set_input_order(reorder_inputs) ) From fc6b83ee5f2d1c9d38519c371e8378c1c51bffad Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Thu, 16 Jan 2025 10:41:36 -0500 Subject: [PATCH 19/40] Move ai-edge-model-explorer into devtools/install_requirements.sh (#7675) Summary: From top-level requirements, move to devtools/install_requirement.sh instead. ai-edge-model-explorer is too new and currently requires numpy<2 which conflicts with our recent upgrade. Let's not take core dependency on this tool yet. 
--- .lintrunner.toml | 1 + devtools/install_requirements.sh | 11 +++++++++++ devtools/visualization/visualization_utils.py | 9 ++++++++- devtools/visualization/visualization_utils_test.py | 9 ++++++++- install_requirements.py | 1 - pytest.ini | 1 + 6 files changed, 29 insertions(+), 3 deletions(-) create mode 100755 devtools/install_requirements.sh diff --git a/.lintrunner.toml b/.lintrunner.toml index fe8ecad1fc..dd75ea8f32 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -294,6 +294,7 @@ include_patterns = [ 'build/**/*.py', 'codegen/**/*.py', # 'devtools/**/*.py', + 'devtools/visualization/**/*.py', 'docs/**/*.py', # 'examples/**/*.py', # 'exir/**/*.py', diff --git a/devtools/install_requirements.sh b/devtools/install_requirements.sh new file mode 100755 index 0000000000..242bc70257 --- /dev/null +++ b/devtools/install_requirements.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Conflict: this requires numpy<2 whereas ExecuTorch core requires numpy>=2 +# Follow https://github.com/google-ai-edge/model-explorer/issues/277 for potential +# resolution +pip install ai-edge-model-explorer>=0.1.16 diff --git a/devtools/visualization/visualization_utils.py b/devtools/visualization/visualization_utils.py index a2ee4c6050..4d520a6636 100644 --- a/devtools/visualization/visualization_utils.py +++ b/devtools/visualization/visualization_utils.py @@ -8,9 +8,16 @@ import time from executorch.exir import EdgeProgramManager, ExecutorchProgramManager -from model_explorer import config, consts, visualize_from_config # type: ignore from torch.export.exported_program import ExportedProgram +try: + from model_explorer import config, consts, visualize_from_config # type: ignore +except ImportError: + print( + "Error: 'model_explorer' is not installed. 
Install using devtools/install_requirement.sh" + ) + raise + class SingletonModelExplorerServer: """Singleton context manager for starting a model-explorer server. diff --git a/devtools/visualization/visualization_utils_test.py b/devtools/visualization/visualization_utils_test.py index 89781ab4f4..dafefa7dfd 100644 --- a/devtools/visualization/visualization_utils_test.py +++ b/devtools/visualization/visualization_utils_test.py @@ -17,7 +17,14 @@ visualize, ) from executorch.exir import ExportedProgram -from model_explorer.config import ModelExplorerConfig # type: ignore + +try: + from model_explorer.config import ModelExplorerConfig # type: ignore +except ImportError: + print( + "Error: 'model_explorer' is not installed. Install using devtools/install_requirement.sh" + ) + raise @pytest.fixture diff --git a/install_requirements.py b/install_requirements.py index 26093cab84..adb26170cd 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -170,7 +170,6 @@ def python_is_compatible(): "tomli", # Imported by extract_sources.py when using python < 3.11. "wheel", # For building the pip package archive. "zstd", # Imported by resolve_buck.py. - "ai-edge-model-explorer>=0.1.16", # For visualizing ExportedPrograms ] # Assemble the list of requirements to actually install. 
diff --git a/pytest.ini b/pytest.ini index b7e9afb9b9..d0c27fdfab 100644 --- a/pytest.ini +++ b/pytest.ini @@ -14,6 +14,7 @@ addopts = # explicitly list out tests that are running successfully in oss examples/models/test devtools/ + --ignore=devtools/visualization/visualization_utils_test.py # examples examples/models/llama/tests examples/models/llama3_2_vision/preprocess From 6d78026648dd3e167842bc53a83cb7b075d63d2f Mon Sep 17 00:00:00 2001 From: SaoirseARM <44364573+SaoirseARM@users.noreply.github.com> Date: Thu, 16 Jan 2025 17:52:00 +0000 Subject: [PATCH 20/40] Arm backend: Remove the reordering of inputs flag (#7698) Remove the reordering of inputs flag - With updated Vela version input/output order is now preserved. - Remove re-order of inputs in compile spec - Remove re-order of inputs in aot_arm_compiler --- backends/arm/arm_backend.py | 10 ------- backends/arm/test/common.py | 8 ------ backends/arm/test/test_arm_baremetal.sh | 4 +-- examples/arm/aot_arm_compiler.py | 38 +++++++------------------ examples/arm/run.sh | 6 +--- 5 files changed, 13 insertions(+), 53 deletions(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index b4512f37af..979a246484 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -122,16 +122,6 @@ def dump_intermediate_artifacts_to( self.path_for_intermediates = output_path return self - def set_input_order( - self, input_order: Optional[str] = None - ) -> "ArmCompileSpecBuilder": - """ - Reorder the inputs coming in. This may be required when inputs > 1. - And while using the U55/U85 CompileSpec. 
- """ - self.input_order = input_order - return self - def build(self) -> List[CompileSpec]: """ Generate a list of compile spec objects from the builder diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index eb97d9b1e7..f1b9762572 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -85,33 +85,28 @@ def get_tosa_compile_spec_unbuilt( def get_u55_compile_spec( custom_path=None, - reorder_inputs=None, ) -> list[CompileSpec]: """ Default compile spec for Ethos-U55 tests. """ return get_u55_compile_spec_unbuilt( custom_path=custom_path, - reorder_inputs=reorder_inputs, ).build() def get_u85_compile_spec( custom_path=None, - reorder_inputs=None, ) -> list[CompileSpec]: """ Default compile spec for Ethos-U85 tests. """ return get_u85_compile_spec_unbuilt( custom_path=custom_path, - reorder_inputs=reorder_inputs, ).build() def get_u55_compile_spec_unbuilt( custom_path=None, - reorder_inputs=None, ) -> ArmCompileSpecBuilder: """Get the ArmCompileSpecBuilder for the Ethos-U55 tests, to modify the compile spec before calling .build() to finalize it. @@ -128,14 +123,12 @@ def get_u55_compile_spec_unbuilt( extra_flags="--debug-force-regor --output-format=raw", ) .dump_intermediate_artifacts_to(artifact_path) - .set_input_order(reorder_inputs) ) return compile_spec def get_u85_compile_spec_unbuilt( custom_path=None, - reorder_inputs=None, ) -> list[CompileSpec]: """Get the ArmCompileSpecBuilder for the Ethos-U85 tests, to modify the compile spec before calling .build() to finalize it. 
@@ -150,7 +143,6 @@ def get_u85_compile_spec_unbuilt( extra_flags="--output-format=raw", ) .dump_intermediate_artifacts_to(artifact_path) - .set_input_order(reorder_inputs) ) return compile_spec diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index 377e1e2eb8..9f2fa4c17d 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -96,12 +96,12 @@ test_run_ethosu_fvp() { # End to End model tests # Ethos-U55 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U55" examples/arm/run.sh --target=ethos-u55-128 --model_name=mv2 - examples/arm/run.sh --target=ethos-u55-128 --model_name=lstm --reorder_inputs=1,0,2 + examples/arm/run.sh --target=ethos-u55-128 --model_name=lstm # Ethos-U85 echo "${TEST_SUITE_NAME}: Test ethos-u target Ethos-U85" examples/arm/run.sh --target=ethos-u85-128 --model_name=mv2 - examples/arm/run.sh --target=ethos-u85-128 --model_name=lstm --reorder_inputs=1,0,2 + examples/arm/run.sh --target=ethos-u85-128 --model_name=lstm } ${TEST_SUITE} \ No newline at end of file diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index e842cde6bb..9563be93aa 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -259,7 +259,6 @@ def get_calibration_data( def get_compile_spec( target: str, intermediates: Optional[str] = None, - reorder_inputs: Optional[str] = None, system_config: Optional[str] = None, memory_mode: Optional[str] = None, ) -> list[CompileSpec]: @@ -267,26 +266,18 @@ def get_compile_spec( if target == "TOSA": spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI") elif "ethos-u55" in target: - spec_builder = ( - ArmCompileSpecBuilder() - .ethosu_compile_spec( - target, - system_config=system_config, - memory_mode=memory_mode, - extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate", - ) - .set_input_order(reorder_inputs) + spec_builder = 
ArmCompileSpecBuilder().ethosu_compile_spec( + target, + system_config=system_config, + memory_mode=memory_mode, + extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate", ) elif "ethos-u85" in target: - spec_builder = ( - ArmCompileSpecBuilder() - .ethosu_compile_spec( - target, - system_config=system_config, - memory_mode=memory_mode, - extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate", - ) - .set_input_order(reorder_inputs) + spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec( + target, + system_config=system_config, + memory_mode=memory_mode, + extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate", ) if intermediates is not None: @@ -429,14 +420,6 @@ def get_args(): required=False, help="Location for outputs, if not the default of cwd.", ) - parser.add_argument( - "-r", - "--reorder_inputs", - type=str, - required=False, - default=None, - help="Provide the order of the inputs. This can be required when inputs > 1.", - ) parser.add_argument( "--system_config", required=False, @@ -519,7 +502,6 @@ def get_args(): compile_spec = get_compile_spec( args.target, args.intermediates, - args.reorder_inputs, args.system_config, args.memory_mode, ) diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 5d492cfcb1..d47e2620e6 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -29,7 +29,6 @@ build_with_etdump=false build_type="Release" extra_build_flags="" build_only=false -reorder_inputs="" system_config="" memory_mode="" @@ -46,7 +45,6 @@ help() { echo " --extra_build_flags Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none " echo " --build_only Only build, don't run FVP" echo " --scratch-dir= Path to your Ethos-U scrach dir if you not using default" - echo " --reorder_inputs= Reorder the inputs. This can be required when inputs > 1." 
echo " --system_config= System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets." echo " NOTE: If given, this option must match the given target. This option also sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt." echo " --memory_mode= Memory mode to select from the Vela configuration file (see vela.ini), e.g. Shared_Sram/Sram_Only. Default: 'Shared_Sram' for Ethos-U55 targets, 'Sram_Only' for Ethos-U85 targets" @@ -66,7 +64,6 @@ for arg in "$@"; do --extra_build_flags=*) extra_build_flags="${arg#*=}";; --build_only) build_only=true ;; --scratch-dir=*) root_dir="${arg#*=}";; - --reorder_inputs=*) reorder_inputs="${arg#*=}";; --system_config=*) system_config="${arg#*=}";; --memory_mode=*) memory_mode="${arg#*=}";; *) @@ -151,7 +148,7 @@ function generate_pte_file() { # We are using the aot_lib from build_quantization_aot_lib below SO_LIB=$(find cmake-out-aot-lib -name libquantized_ops_aot_lib.${SO_EXT}) - local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --reorder_inputs=${reorder_inputs} --output ${output_folder} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}" + local ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --output ${output_folder} --so_library=$SO_LIB --system_config=${system_config} --memory_mode=${memory_mode}" echo "CALL ${ARM_AOT_CMD}" >&2 ${ARM_AOT_CMD} 1>&2 @@ -372,7 +369,6 @@ if [[ -z "$model_name" ]]; then else test_model=( "$model_name" ) model_compiler_flags=( "$aot_arm_compiler_flags" ) - reorder_inputs=( "$reorder_inputs" ) fi # loop over running the AoT flow and executing the model on device From af7613c7a5dd39e480aafc1146cd78f55d40bbbb Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Thu, 
16 Jan 2025 09:59:32 -0800 Subject: [PATCH 21/40] [ExecuTorch][Llama] Split custom sdpa op and kv cache (#7412) * [ExecuTorch][Llama] Split custom sdpa op and kv cache Summary: This enables us to do more easier module swap with model definitions from torchtune Test Plan: CI Reviewers: Subscribers: Tasks: Tags: [ghstack-poisoned] * Update on "[ExecuTorch][Llama] Split custom sdpa op and kv cache" Summary: This enables us to do more easier module swap with model definitions from torchtune Test Plan: CI Reviewers: Subscribers: Tasks: Tags: [ghstack-poisoned] --- examples/models/llama/export_llama_lib.py | 2 + .../quantized_kv_cache.py | 79 +++++++++++++++++-- .../llama/source_transformation/sdpa.py | 39 +++------ .../test_sdpa_with_quantized_kv_cache.py | 17 +++- 4 files changed, 101 insertions(+), 36 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index a562bdf13f..69980990cf 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -56,6 +56,7 @@ get_quant_weight_transform, ) from .source_transformation.quantized_kv_cache import ( + replace_kv_cache_with_custom_kv_cache, replace_kv_cache_with_quantized_kv_cache, ) from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm @@ -1058,6 +1059,7 @@ def _get_source_transforms( # noqa transforms.append(materialze_broadcast_of_rope_freq_cis) if args.use_sdpa_with_kv_cache: + transforms.append(replace_kv_cache_with_custom_kv_cache) transforms.append(replace_sdpa_with_custom_op) if args.quantize_kv_cache: diff --git a/examples/models/llama/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py index a0c8c2fd93..d8ac99656f 100644 --- a/examples/models/llama/source_transformation/quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/quantized_kv_cache.py @@ -6,6 +6,7 @@ import logging from enum import Enum +from typing 
import Tuple import torch import torch.nn as nn @@ -44,7 +45,6 @@ def __init__( QuantizedCacheType.AffineSymmetric, QuantizedCacheType.AffineAsymmetric, ): - raise ValueError( f"Only affine symmetric and asymmetric cache types are supported: got {cache_type}" ) @@ -81,10 +81,11 @@ def __init__( ) def _quantize(self, value): - scales, zero_points = ( - torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( - value, self.quantized_cache_dtype - ) + ( + scales, + zero_points, + ) = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( + value, self.quantized_cache_dtype ) quantized_value = torch.ops.quantized_decomposed.quantize_per_token( value, @@ -262,3 +263,71 @@ def replace_kv_cache_with_quantized_kv_cache(module): else: replace_kv_cache_with_quantized_kv_cache(child) return module + + +class CustomKVCache(nn.Module): + def __init__( + self, + max_batch_size: int, + max_seq_length: int, + n_heads: int, + head_dim: int, + dtype=torch.float32, + ): + super().__init__() + self.max_seq_length = max_seq_length + cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim) + + self.max_batch_size = max_batch_size + self.n_heads = n_heads + self.head_dim = head_dim + self.register_buffer( + "k_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu") + ) + self.register_buffer( + "v_cache", torch.zeros(cache_shape, dtype=dtype, device="cpu") + ) + + def update( + self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # input_pos: [S], k_val: [B, S, H, D] + start_pos = input_pos[0].item() + _ = torch.ops.llama.update_cache(k_val, self.k_cache, start_pos) + _ = torch.ops.llama.update_cache(v_val, self.v_cache, start_pos) + return self.k_cache, self.v_cache + + +def replace_kv_cache_with_custom_kv_cache(module): + r""" + Replace KVCache with CustomKVCache. This modifies the model in place. 
+ At the moment custom kv cache only supports cache with shape + [B, S, H, D] as opposed to [B, H, S, D] + This is because the custom op treats second dim as sequence dim. + Future work: support [B, H, S, D] + """ + logging.warning( + "Replacing KVCache with CustomKVCache. This modifies the model in place." + ) + for name, child in module.named_children(): + if isinstance(child, KVCache): + cache_shape = child.k_cache.shape + cache_dtype = child.k_cache.dtype + assert ( + child.is_transposed is False + ), "CustomKVCache does not support transposed cache" + max_batch_size, max_seq_length, n_heads, head_dim = cache_shape + setattr( + module, + name, + CustomKVCache( + max_batch_size, + max_seq_length, + n_heads, + head_dim, + dtype=cache_dtype, + ), + ) + else: + replace_kv_cache_with_custom_kv_cache(child) + return module diff --git a/examples/models/llama/source_transformation/sdpa.py b/examples/models/llama/source_transformation/sdpa.py index 59bfbe6f95..4d4b3bf7f5 100644 --- a/examples/models/llama/source_transformation/sdpa.py +++ b/examples/models/llama/source_transformation/sdpa.py @@ -56,33 +56,16 @@ def forward( k_cache = self.kv_cache.k_cache v_cache = self.kv_cache.v_cache - if hasattr(self.kv_cache, "quantized_cache_dtype"): - # updated quantize cache, scale and zero points - # returns dequantized kv cache - # Not most optimal. Optimizations to follow next - k_cache, v_cache = self.kv_cache.update(input_pos, k, v) - output = torch.ops.llama.custom_sdpa( - q, - k_cache, - v_cache, - input_pos[0].item(), - None, # Attention mask - 0, # dropout probability. Ignored by the code - True, # is_causal - ) - else: - output = torch.ops.llama.sdpa_with_kv_cache( - q, - k, - v, - k_cache, - v_cache, - input_pos[0].item(), - seqlen, - None, # Attention mask - 0, # dropout probability. 
Ignored by the code - True, # is_causal - ) + k_cache, v_cache = self.kv_cache.update(input_pos, k, v) + output = torch.ops.llama.custom_sdpa( + q, + k_cache, + v_cache, + input_pos[0].item(), + None, # Attention mask + 0, # dropout probability. Ignored by the code + True, # is_causal + ) return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype) @@ -106,7 +89,6 @@ def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module: class SDPASimple(torch.nn.Module): - def __init__( self, kv_cache: KVCache, @@ -166,7 +148,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class SDPAFlex(torch.nn.Module): - def __init__( self, kv_cache: KVCache, diff --git a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py index 21952d8c21..57c36dabf9 100644 --- a/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py @@ -11,6 +11,7 @@ from executorch.examples.models.llama.llama_transformer import KVCache from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( + CustomKVCache, QuantizedCacheType, QuantizedKVCache, ) @@ -19,7 +20,6 @@ class SDPAWithQuantizedKVCacheTest(unittest.TestCase): - def _init_cache(self): self.kv_cache = KVCache( self.max_batch_size, @@ -33,6 +33,19 @@ def _init_cache(self): self.quantized_kv_cache = QuantizedKVCache.from_float( self.kv_cache, QuantizedCacheType.AffineAsymmetric ) + # Need this because first test actually has seq_len > 1 + # and vanilla kvcache cannot handle seq_len > 1, due to + # how input_pos encoding works in the current stack. 
+ # This needs fixing by making sure rest of the stack including + # custom ops or other backends can work with input_pos + # as a sequence of token positions + self.custom_kv_cache = CustomKVCache( + self.max_batch_size, + self.max_seq_len, + self.n_kv_heads, + self.head_dim, + dtype=self.dtype, + ) def _init_kv(self): kv_shape = (1, self.seq_len, self.n_kv_heads, self.head_dim) @@ -59,7 +72,7 @@ def test_simple(self, is_dynamic_shape=False): self.seq_len = 3 self._init_cache() q, k, v = self._init_kv() - self.float_sdpa = SDPACustom(self.kv_cache, self.dim) + self.float_sdpa = SDPACustom(self.custom_kv_cache, self.dim) self.quantized_sdpa = SDPACustom(self.quantized_kv_cache, self.dim) float_out = self.float_sdpa(input_pos, q, k, v, 1, self.seq_len, None) quantized_out = self.quantized_sdpa(input_pos, q, k, v, 1, self.seq_len, None) From 745f17e27590677638bd838ce8a258097acf3e21 Mon Sep 17 00:00:00 2001 From: Hardik Sharma Date: Thu, 16 Jan 2025 10:31:39 -0800 Subject: [PATCH 22/40] Fix Graph builder for higher order ops. 
Differential Revision: D68231732 Pull Request resolved: https://github.com/pytorch/executorch/pull/7684 --- backends/cadence/aot/graph_builder.py | 19 ++++++++++- .../cadence/aot/tests/test_graph_builder.py | 33 ++++++++++++++++++- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/backends/cadence/aot/graph_builder.py b/backends/cadence/aot/graph_builder.py index 88ed2ac769..fc9441891a 100644 --- a/backends/cadence/aot/graph_builder.py +++ b/backends/cadence/aot/graph_builder.py @@ -6,7 +6,8 @@ from typing import Optional, Sequence, Union import torch -from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue +from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue +from torch._dispatch.python import enable_python_dispatcher from torch._subclasses import FakeTensor, FakeTensorMode from torch.fx.node import Argument, Target from torch.utils import _pytree as pytree @@ -80,6 +81,22 @@ def call_operator( kwargs = {} return super().call_operator(op, args, kwargs, meta) + def call_submodule( + self, graph_module: torch.fx.GraphModule, inputs: tuple[Argument, ...] + ) -> PassResult: + return ExportPass().call(graph_module) + + def _fx( + self, + kind: str, + target: torch.fx.node.Target, + args: tuple[Argument, ...], + kwargs: dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + with self.fake_tensor_mode, enable_python_dispatcher(): + return super()._fx(kind, target, args, kwargs, meta) + def single_op_builder( placeholders: Sequence[Union[torch.Tensor, FakeTensor]], diff --git a/backends/cadence/aot/tests/test_graph_builder.py b/backends/cadence/aot/tests/test_graph_builder.py index 04097c1725..ebef97be52 100644 --- a/backends/cadence/aot/tests/test_graph_builder.py +++ b/backends/cadence/aot/tests/test_graph_builder.py @@ -1,5 +1,9 @@ # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+# pyre-strict + + +from typing import Sequence import executorch.backends.cadence.aot.ops_registrations # noqa import torch @@ -9,7 +13,7 @@ ) from executorch.backends.cadence.aot.pass_utils import count_node from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass +from executorch.exir.pass_base import ExportPass, NodeMetadata from later.unittest import TestCase @@ -68,3 +72,30 @@ def test_graph_with_single_im2row(self) -> None: # Check graph has a single im2row node. self.assertEqual(len([gm.graph.nodes]), 1) self.assertEqual(count_node(gm, exir_ops.edge.cadence.im2row.default), 1) + + +class TestHigherOrderOps(TestCase): + def _get_inner_graph(self, x_shape: Sequence[int]) -> torch.fx.GraphModule: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(*x_shape)) + add = builder.call_operator( + exir_ops.edge.aten.add.Tensor, + (x, x), # pyre-ignore + ) + builder.output([x, add]) + gm = builder.get_graph_module() + # Check if graph module is valid by running exportpass on it. + gm = ExportPass().call(gm).graph_module + return gm + + def test_call_map(self) -> None: + builder = GraphBuilder() + x_shape = (4, 8, 8) + x = builder.placeholder("x", torch.randn(*x_shape)) + map_node = builder.call_map( + self._get_inner_graph(x_shape[1:]), [x], [], NodeMetadata({}) + ) + builder.output([map_node]) + gm = builder.get_graph_module() + # Check if graph module is valid by running exportpass on it. 
+ ExportPass().call(gm).graph_module From b8180626c129f9f11b33e4424aca7dfee6fca9df Mon Sep 17 00:00:00 2001 From: cccclai Date: Thu, 16 Jan 2025 13:03:04 -0800 Subject: [PATCH 23/40] add qnn test template job (#7636) * add qnn test template job (#7636) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/7636 Differential Revision: D68112936 * Increase timeout --------- Co-authored-by: Huy Do --- .github/workflows/pull.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index d1b64e7598..8b32e46cf2 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -395,6 +395,25 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}" + test-qnn-models-linux: + name: test-qnn-models-linux + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + strategy: + fail-fast: false + with: + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-qnn-sdk + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 180 + script: | + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + # placeholder for running test_qnn_delegate.py, can use matrix such that we can trigger different jobs, refers to test-llama-runner-qnn-linux + # reminder: make sure each job runs fast + test-phi-3-mini-runner-linux: name: test-phi-3-mini-runner-linux uses: pytorch/test-infra/.github/workflows/linux_job.yml@main From 9c043290ad3944268290e015c3063bc411e6ef6b Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Thu, 16 Jan 2025 13:07:16 -0800 Subject: [PATCH 24/40] support half and bf16 in to_dim_order_copy (#7689) Differential Revision: D68245619 Pull 
Request resolved: https://github.com/pytorch/executorch/pull/7693 --- .../portable/cpu/op__to_dim_order_copy.cpp | 8 ++++++-- kernels/test/op__to_dim_order_copy_test.cpp | 19 +++++++++++++++---- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/kernels/portable/cpu/op__to_dim_order_copy.cpp b/kernels/portable/cpu/op__to_dim_order_copy.cpp index 31dd4fbb9d..bcbf6cc132 100644 --- a/kernels/portable/cpu/op__to_dim_order_copy.cpp +++ b/kernels/portable/cpu/op__to_dim_order_copy.cpp @@ -96,13 +96,17 @@ Tensor& _to_dim_order_copy_out( InvalidArgument, out); - ET_SWITCH_REALHB_TYPES( + if (self.numel() == 0) { + return out; + } + + ET_SWITCH_REALHBBF16_TYPES( self.scalar_type(), ctx, "dim_order_ops::_to_dim_order_copy.out", CTYPE_IN, [&] { - ET_SWITCH_REALHB_TYPES( + ET_SWITCH_REALHBBF16_TYPES( out.scalar_type(), ctx, "dim_order_ops::_to_dim_order_copy.out", diff --git a/kernels/test/op__to_dim_order_copy_test.cpp b/kernels/test/op__to_dim_order_copy_test.cpp index e888e0fc7f..073225a7d6 100644 --- a/kernels/test/op__to_dim_order_copy_test.cpp +++ b/kernels/test/op__to_dim_order_copy_test.cpp @@ -36,7 +36,9 @@ typedef std::map< std::type_index, std::variant< std::vector, - std::vector>> + std::vector, + std::vector, + std::vector>> FloatingTypeToDataMap; typedef std::map< @@ -381,9 +383,9 @@ TEST_F(OpToDimOrderCopyTest, NanInfSupported) { ScalarType::OUTPUT_DTYPE>(test_cases); #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ - ET_FORALL_FLOAT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); + ET_FORALL_FLOATHBF16_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); #undef TEST_ENTRY #undef TEST_KERNEL @@ -413,6 +415,13 @@ TEST_F(OpToDimOrderCopyTest, HardcodeFloatConvertInt) { -0.30919688936285893988}; // clang-format on + std::vector half_data; + std::vector bf16_data; + for (auto d : double_data) { + half_data.emplace_back(d); + bf16_data.emplace_back(d); + } + std::vector 
int64_data = { -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0}; std::vector int32_data = { @@ -426,6 +435,8 @@ TEST_F(OpToDimOrderCopyTest, HardcodeFloatConvertInt) { FloatingTypeToDataMap floating_point_data; floating_point_data[typeid(float)] = float_data; floating_point_data[typeid(double)] = double_data; + floating_point_data[typeid(exec_aten::Half)] = half_data; + floating_point_data[typeid(exec_aten::BFloat16)] = bf16_data; // Gathering all int data together for better traversial IntTypeToDataMap int_data; @@ -444,7 +455,7 @@ TEST_F(OpToDimOrderCopyTest, HardcodeFloatConvertInt) { #define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \ ET_FORALL_INT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL); - ET_FORALL_FLOAT_TYPES(TEST_ENTRY); + ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY); } TEST_F(OpToDimOrderCopyTest, MismatchedSizesDie) { From 1b7b10efd1d61f4e96a129ea416dcc6bcc2c3f2c Mon Sep 17 00:00:00 2001 From: Nicholas Long <19273992+cptspacemanspiff@users.noreply.github.com> Date: Thu, 16 Jan 2025 17:07:36 -0500 Subject: [PATCH 25/40] Fixed always rebuild issue in cmake. (#7512) * Fixed always rebuild issue in cmake. The generated files were located in include/executorch/schema/program_generated.h CMake was expecting files in include/executorch/program_generated.h Presumably this was a change at some point, and the expected output from cmake never got updated. --- schema/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/schema/CMakeLists.txt b/schema/CMakeLists.txt index 5a4013f43e..64f8821da1 100644 --- a/schema/CMakeLists.txt +++ b/schema/CMakeLists.txt @@ -15,7 +15,7 @@ endif() # The include directory that will contain the generated schema headers. set(_program_schema__include_dir "${CMAKE_BINARY_DIR}/schema/include") - +set(_program_schema__output_dir "${_program_schema__include_dir}/executorch/schema") # Source root directory for executorch. if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) 
@@ -26,7 +26,7 @@ function(generate_program_schema _schema_srcs _schema_name) foreach(fbs_file ${_schema_srcs}) string(REGEX REPLACE "[.]fbs$" "_generated.h" generated "${fbs_file}") list(APPEND _schema_outputs - "${_program_schema__include_dir}/executorch/${generated}" + "${_program_schema__output_dir}/${generated}" ) endforeach() @@ -35,7 +35,7 @@ function(generate_program_schema _schema_srcs _schema_name) OUTPUT ${_schema_outputs} COMMAND ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --gen-mutable --scoped-enums -o - "${_program_schema__include_dir}/executorch/schema" ${_schema_srcs} + "${_program_schema__output_dir}" ${_schema_srcs} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${FLATC_EXECUTABLE} ${_schema_srcs} COMMENT "Generating ${_schema_name} headers" From 2f0518d2cfb4ee4353dce4e39590de43fa391399 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Thu, 16 Jan 2025 18:03:54 -0600 Subject: [PATCH 26/40] [xnnpack] Reexport after quantize in aot_compiler (#7714) --- examples/xnnpack/aot_compiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/xnnpack/aot_compiler.py b/examples/xnnpack/aot_compiler.py index c3538db4d8..e1542245ac 100644 --- a/examples/xnnpack/aot_compiler.py +++ b/examples/xnnpack/aot_compiler.py @@ -92,6 +92,7 @@ logging.info("Quantizing Model...") # TODO(T165162973): This pass shall eventually be folded into quantizer model = quantize(model, example_inputs) + ep = torch.export.export_for_training(model, example_inputs) edge = to_edge_transform_and_lower( ep, From 007ea3ed0cec116f669cca389a5f23679f7da74c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 16 Jan 2025 16:37:29 -0800 Subject: [PATCH 27/40] install_requirements.py: use argparse, minor cleanup (#7703) Replace the bespoke argument parser with argparse in preparation for separating requirements installation from building and installing ExecuTorch itself. 
--- install_requirements.py | 347 ++++++++++++++++++++++------------------ 1 file changed, 188 insertions(+), 159 deletions(-) diff --git a/install_requirements.py b/install_requirements.py index adb26170cd..c16cacca46 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -6,7 +6,9 @@ # LICENSE file in the root directory of this source tree. +import argparse import glob +import itertools import os import platform import re @@ -63,174 +65,201 @@ def python_is_compatible(): return True -if not python_is_compatible(): - sys.exit(1) +def clean(): + print("Cleaning build artifacts...") + print("Cleaning pip-out/...") + shutil.rmtree("pip-out/", ignore_errors=True) + dirs = glob.glob("cmake-out*/") + glob.glob("cmake-android-out/") + for d in dirs: + print(f"Cleaning {d}...") + shutil.rmtree(d, ignore_errors=True) + print("Done cleaning build artifacts.") -# Parse options. -EXECUTORCH_BUILD_PYBIND = "" -CMAKE_ARGS = os.getenv("CMAKE_ARGS", "") -CMAKE_BUILD_ARGS = os.getenv("CMAKE_BUILD_ARGS", "") -USE_PYTORCH_NIGHTLY = True +VALID_PYBINDS = ["coreml", "mps", "xnnpack"] -args = sys.argv[1:] -for arg in args: - if arg == "--pybind": - pass - elif arg in ["coreml", "mps", "xnnpack"]: - if "--pybind" in args: - arg_upper = arg.upper() - EXECUTORCH_BUILD_PYBIND = "ON" - CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{arg_upper}=ON" - else: - print(f"Error: {arg} must follow --pybind") - sys.exit(1) - elif arg == "off": - if "--pybind" in args: - if EXECUTORCH_BUILD_PYBIND == "ON": - print("Cannot turnoff pybind option as it is already set.") - sys.exit(1) + +def main(args): + if not python_is_compatible(): + sys.exit(1) + + # Parse options. 
+ + EXECUTORCH_BUILD_PYBIND = "" + CMAKE_ARGS = os.getenv("CMAKE_ARGS", "") + CMAKE_BUILD_ARGS = os.getenv("CMAKE_BUILD_ARGS", "") + USE_PYTORCH_NIGHTLY = True + + parser = argparse.ArgumentParser() + parser.add_argument( + "--pybind", + action="append", + nargs="+", + help="one or more of coreml/mps/xnnpack, or off", + ) + parser.add_argument( + "--clean", + action="store_true", + help="clean build artifacts and pip-out instead of installing", + ) + parser.add_argument( + "--use-pt-pinned-commit", + action="store_true", + help="build from the pinned PyTorch commit instead of nightly", + ) + args = parser.parse_args(args) + + if args.clean: + clean() + return + + if args.pybind: + # Flatten list of lists. + args.pybind = list(itertools.chain(*args.pybind)) + if "off" in args.pybind: + if len(args.pybind) != 1: + raise Exception( + f"Cannot combine `off` with other pybinds: {args.pybind}" + ) EXECUTORCH_BUILD_PYBIND = "OFF" else: - print(f"Error: {arg} must follow --pybind") - sys.exit(1) - - elif arg == "--clean": - print("Cleaning build artifacts...") - print("Cleaning pip-out/...") - shutil.rmtree("pip-out/", ignore_errors=True) - dirs = glob.glob("cmake-out*/") + glob.glob("cmake-android-out/") - for d in dirs: - print(f"Cleaning {d}...") - shutil.rmtree(d, ignore_errors=True) - print("Done cleaning build artifacts.") - sys.exit(0) - elif arg == "--use-pt-pinned-commit": + for pybind_arg in args.pybind: + if pybind_arg not in VALID_PYBINDS: + raise Exception( + f"Unrecognized pybind argument {pybind_arg}; valid options are: {', '.join(VALID_PYBINDS)}" + ) + EXECUTORCH_BUILD_PYBIND = "ON" + CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON" + + if args.use_pt_pinned_commit: # This option is used in CI to make sure that PyTorch build from the pinned commit # is used instead of nightly. 
CI jobs wouldn't be able to catch regression from the # latest PT commit otherwise USE_PYTORCH_NIGHTLY = False - else: - print(f"Error: Unknown option {arg}") - sys.exit(1) -# If --pybind is not set explicitly for backends (e.g., --pybind xnnpack) -# or is not turned off explicitly (--pybind off) -# then install XNNPACK by default. -if EXECUTORCH_BUILD_PYBIND == "": - EXECUTORCH_BUILD_PYBIND = "ON" - CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON" - -# Use ClangCL on Windows. -# ClangCL is an alias to Clang that configures it to work in an MSVC-compatible -# mode. Using it on Windows to avoid compiler compatibility issues for MSVC. -if os.name == "nt": - CMAKE_ARGS += " -T ClangCL" - -# Since ExecuTorch often uses main-branch features of pytorch, only the nightly -# pip versions will have the required features. -# -# NOTE: If a newly-fetched version of the executorch repo changes the value of -# NIGHTLY_VERSION, you should re-run this script to install the necessary -# package versions. -NIGHTLY_VERSION = "dev20250104" - -# The pip repository that hosts nightly torch packages. -TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" - -# pip packages needed by exir. -EXIR_REQUIREMENTS = [ - # Setting USE_PYTORCH_NIGHTLY to false to test the pinned PyTorch commit. Note - # that we don't need to set any version number there because they have already - # been installed on CI before this step, so pip won't reinstall them - f"torch==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torch", - ( - f"torchvision==0.22.0.{NIGHTLY_VERSION}" - if USE_PYTORCH_NIGHTLY - else "torchvision" - ), # For testing. - "typing-extensions", -] - -# pip packages needed to run examples. -# TODO: Make each example publish its own requirements.txt -EXAMPLES_REQUIREMENTS = [ - "timm==1.0.7", - f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torchaudio", - "torchsr==1.0.4", - "transformers==4.47.1", -] - -# pip packages needed for development. 
-DEVEL_REQUIREMENTS = [ - "cmake", # For building binary targets. - "pip>=23", # For building the pip package. - "pyyaml", # Imported by the kernel codegen tools. - "setuptools>=63", # For building the pip package. - "tomli", # Imported by extract_sources.py when using python < 3.11. - "wheel", # For building the pip package archive. - "zstd", # Imported by resolve_buck.py. -] - -# Assemble the list of requirements to actually install. -# TODO: Add options for reducing the number of requirements. -REQUIREMENTS_TO_INSTALL = EXIR_REQUIREMENTS + DEVEL_REQUIREMENTS + EXAMPLES_REQUIREMENTS - -# Install the requirements. `--extra-index-url` tells pip to look for package -# versions on the provided URL if they aren't available on the default URL. -subprocess.run( - [ - sys.executable, - "-m", - "pip", - "install", - *REQUIREMENTS_TO_INSTALL, - "--extra-index-url", - TORCH_NIGHTLY_URL, - ], - check=True, -) - -LOCAL_REQUIREMENTS = [ - "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. -] - -# Install packages directly from local copy instead of pypi. -# This is usually not recommended. -subprocess.run( - [ - sys.executable, - "-m", - "pip", - "install", - *LOCAL_REQUIREMENTS, - ], - check=True, -) + # If --pybind is not set explicitly for backends (e.g., --pybind xnnpack) + # or is not turned off explicitly (--pybind off) + # then install XNNPACK by default. + if EXECUTORCH_BUILD_PYBIND == "": + EXECUTORCH_BUILD_PYBIND = "ON" + CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON" + + # Use ClangCL on Windows. + # ClangCL is an alias to Clang that configures it to work in an MSVC-compatible + # mode. Using it on Windows to avoid compiler compatibility issues for MSVC. + if os.name == "nt": + CMAKE_ARGS += " -T ClangCL" + + # Since ExecuTorch often uses main-branch features of pytorch, only the nightly + # pip versions will have the required features. 
+ # + # NOTE: If a newly-fetched version of the executorch repo changes the value of + # NIGHTLY_VERSION, you should re-run this script to install the necessary + # package versions. + NIGHTLY_VERSION = "dev20250104" + + # The pip repository that hosts nightly torch packages. + TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" + + # pip packages needed by exir. + EXIR_REQUIREMENTS = [ + # Setting USE_PYTORCH_NIGHTLY to false to test the pinned PyTorch commit. Note + # that we don't need to set any version number there because they have already + # been installed on CI before this step, so pip won't reinstall them + f"torch==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torch", + ( + f"torchvision==0.22.0.{NIGHTLY_VERSION}" + if USE_PYTORCH_NIGHTLY + else "torchvision" + ), # For testing. + "typing-extensions", + ] + + # pip packages needed to run examples. + # TODO: Make each example publish its own requirements.txt + EXAMPLES_REQUIREMENTS = [ + "timm==1.0.7", + f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torchaudio", + "torchsr==1.0.4", + "transformers==4.47.1", + ] + + # pip packages needed for development. + DEVEL_REQUIREMENTS = [ + "cmake", # For building binary targets. + "pip>=23", # For building the pip package. + "pyyaml", # Imported by the kernel codegen tools. + "setuptools>=63", # For building the pip package. + "tomli", # Imported by extract_sources.py when using python < 3.11. + "wheel", # For building the pip package archive. + "zstd", # Imported by resolve_buck.py. + ] + + # Assemble the list of requirements to actually install. + # TODO: Add options for reducing the number of requirements. + REQUIREMENTS_TO_INSTALL = ( + EXIR_REQUIREMENTS + DEVEL_REQUIREMENTS + EXAMPLES_REQUIREMENTS + ) + + # Install the requirements. `--extra-index-url` tells pip to look for package + # versions on the provided URL if they aren't available on the default URL. 
+ subprocess.run( + [ + sys.executable, + "-m", + "pip", + "install", + *REQUIREMENTS_TO_INSTALL, + "--extra-index-url", + TORCH_NIGHTLY_URL, + ], + check=True, + ) + + LOCAL_REQUIREMENTS = [ + "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. + ] + + # Install packages directly from local copy instead of pypi. + # This is usually not recommended. + subprocess.run( + [ + sys.executable, + "-m", + "pip", + "install", + *LOCAL_REQUIREMENTS, + ], + check=True, + ) + + # + # Install executorch pip package. This also makes `flatc` available on the path. + # The --extra-index-url may be necessary if pyproject.toml has a dependency on a + # pre-release or nightly version of a torch package. + # + + # Set environment variables + os.environ["EXECUTORCH_BUILD_PYBIND"] = EXECUTORCH_BUILD_PYBIND + os.environ["CMAKE_ARGS"] = CMAKE_ARGS + os.environ["CMAKE_BUILD_ARGS"] = CMAKE_BUILD_ARGS + + # Run the pip install command + subprocess.run( + [ + sys.executable, + "-m", + "pip", + "install", + ".", + "--no-build-isolation", + "-v", + "--extra-index-url", + TORCH_NIGHTLY_URL, + ], + check=True, + ) -# -# Install executorch pip package. This also makes `flatc` available on the path. -# The --extra-index-url may be necessary if pyproject.toml has a dependency on a -# pre-release or nightly version of a torch package. 
-# -# Set environment variables -os.environ["EXECUTORCH_BUILD_PYBIND"] = EXECUTORCH_BUILD_PYBIND -os.environ["CMAKE_ARGS"] = CMAKE_ARGS -os.environ["CMAKE_BUILD_ARGS"] = CMAKE_BUILD_ARGS - -# Run the pip install command -subprocess.run( - [ - sys.executable, - "-m", - "pip", - "install", - ".", - "--no-build-isolation", - "-v", - "--extra-index-url", - TORCH_NIGHTLY_URL, - ], - check=True, -) +if __name__ == "__main__": + main(sys.argv[1:]) From 9f47380ab5d4818270a7ea2eac13d9a4caa76dd0 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 16 Jan 2025 16:41:28 -0800 Subject: [PATCH 28/40] install_requirements.py: refactor: extract install_requirements() function (#7704) More preparation for separating installation of requirements from installation of ExecuTorch. Test Plan: ./install_requirements.sh in a fresh venv succeeded and reported installing executorch --- install_requirements.py | 169 +++++++++++++++++++++------------------- 1 file changed, 87 insertions(+), 82 deletions(-) diff --git a/install_requirements.py b/install_requirements.py index c16cacca46..409460ca10 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -79,97 +79,29 @@ def clean(): VALID_PYBINDS = ["coreml", "mps", "xnnpack"] -def main(args): - if not python_is_compatible(): - sys.exit(1) - - # Parse options. 
- - EXECUTORCH_BUILD_PYBIND = "" - CMAKE_ARGS = os.getenv("CMAKE_ARGS", "") - CMAKE_BUILD_ARGS = os.getenv("CMAKE_BUILD_ARGS", "") - USE_PYTORCH_NIGHTLY = True - - parser = argparse.ArgumentParser() - parser.add_argument( - "--pybind", - action="append", - nargs="+", - help="one or more of coreml/mps/xnnpack, or off", - ) - parser.add_argument( - "--clean", - action="store_true", - help="clean build artifacts and pip-out instead of installing", - ) - parser.add_argument( - "--use-pt-pinned-commit", - action="store_true", - help="build from the pinned PyTorch commit instead of nightly", - ) - args = parser.parse_args(args) - - if args.clean: - clean() - return - - if args.pybind: - # Flatten list of lists. - args.pybind = list(itertools.chain(*args.pybind)) - if "off" in args.pybind: - if len(args.pybind) != 1: - raise Exception( - f"Cannot combine `off` with other pybinds: {args.pybind}" - ) - EXECUTORCH_BUILD_PYBIND = "OFF" - else: - for pybind_arg in args.pybind: - if pybind_arg not in VALID_PYBINDS: - raise Exception( - f"Unrecognized pybind argument {pybind_arg}; valid options are: {', '.join(VALID_PYBINDS)}" - ) - EXECUTORCH_BUILD_PYBIND = "ON" - CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON" +# The pip repository that hosts nightly torch packages. +TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" - if args.use_pt_pinned_commit: - # This option is used in CI to make sure that PyTorch build from the pinned commit - # is used instead of nightly. CI jobs wouldn't be able to catch regression from the - # latest PT commit otherwise - USE_PYTORCH_NIGHTLY = False - # If --pybind is not set explicitly for backends (e.g., --pybind xnnpack) - # or is not turned off explicitly (--pybind off) - # then install XNNPACK by default. - if EXECUTORCH_BUILD_PYBIND == "": - EXECUTORCH_BUILD_PYBIND = "ON" - CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON" - - # Use ClangCL on Windows. 
- # ClangCL is an alias to Clang that configures it to work in an MSVC-compatible - # mode. Using it on Windows to avoid compiler compatibility issues for MSVC. - if os.name == "nt": - CMAKE_ARGS += " -T ClangCL" - - # Since ExecuTorch often uses main-branch features of pytorch, only the nightly - # pip versions will have the required features. - # - # NOTE: If a newly-fetched version of the executorch repo changes the value of - # NIGHTLY_VERSION, you should re-run this script to install the necessary - # package versions. - NIGHTLY_VERSION = "dev20250104" +# Since ExecuTorch often uses main-branch features of pytorch, only the nightly +# pip versions will have the required features. +# +# NOTE: If a newly-fetched version of the executorch repo changes the value of +# NIGHTLY_VERSION, you should re-run this script to install the necessary +# package versions. +NIGHTLY_VERSION = "dev20250104" - # The pip repository that hosts nightly torch packages. - TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" +def install_requirements(use_pytorch_nightly): # pip packages needed by exir. EXIR_REQUIREMENTS = [ - # Setting USE_PYTORCH_NIGHTLY to false to test the pinned PyTorch commit. Note + # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note # that we don't need to set any version number there because they have already # been installed on CI before this step, so pip won't reinstall them - f"torch==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torch", + f"torch==2.6.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch", ( f"torchvision==0.22.0.{NIGHTLY_VERSION}" - if USE_PYTORCH_NIGHTLY + if use_pytorch_nightly else "torchvision" ), # For testing. 
"typing-extensions", @@ -179,7 +111,7 @@ def main(args): # TODO: Make each example publish its own requirements.txt EXAMPLES_REQUIREMENTS = [ "timm==1.0.7", - f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if USE_PYTORCH_NIGHTLY else "torchaudio", + f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchaudio", "torchsr==1.0.4", "transformers==4.47.1", ] @@ -233,6 +165,79 @@ def main(args): check=True, ) + +def main(args): + if not python_is_compatible(): + sys.exit(1) + + # Parse options. + + EXECUTORCH_BUILD_PYBIND = "" + CMAKE_ARGS = os.getenv("CMAKE_ARGS", "") + CMAKE_BUILD_ARGS = os.getenv("CMAKE_BUILD_ARGS", "") + use_pytorch_nightly = True + + parser = argparse.ArgumentParser() + parser.add_argument( + "--pybind", + action="append", + nargs="+", + help="one or more of coreml/mps/xnnpack, or off", + ) + parser.add_argument( + "--clean", + action="store_true", + help="clean build artifacts and pip-out instead of installing", + ) + parser.add_argument( + "--use-pt-pinned-commit", + action="store_true", + help="build from the pinned PyTorch commit instead of nightly", + ) + args = parser.parse_args(args) + if args.pybind: + # Flatten list of lists. + args.pybind = list(itertools.chain(*args.pybind)) + if "off" in args.pybind: + if len(args.pybind) != 1: + raise Exception( + f"Cannot combine `off` with other pybinds: {args.pybind}" + ) + EXECUTORCH_BUILD_PYBIND = "OFF" + else: + for pybind_arg in args.pybind: + if pybind_arg not in VALID_PYBINDS: + raise Exception( + f"Unrecognized pybind argument {pybind_arg}; valid options are: {', '.join(VALID_PYBINDS)}" + ) + EXECUTORCH_BUILD_PYBIND = "ON" + CMAKE_ARGS += f" -DEXECUTORCH_BUILD_{pybind_arg.upper()}=ON" + + if args.clean: + clean() + return + + if args.use_pt_pinned_commit: + # This option is used in CI to make sure that PyTorch build from the pinned commit + # is used instead of nightly. 
CI jobs wouldn't be able to catch regression from the + # latest PT commit otherwise + use_pytorch_nightly = False + + install_requirements(use_pytorch_nightly) + + # If --pybind is not set explicitly for backends (e.g., --pybind xnnpack) + # or is not turned off explicitly (--pybind off) + # then install XNNPACK by default. + if EXECUTORCH_BUILD_PYBIND == "": + EXECUTORCH_BUILD_PYBIND = "ON" + CMAKE_ARGS += " -DEXECUTORCH_BUILD_XNNPACK=ON" + + # Use ClangCL on Windows. + # ClangCL is an alias to Clang that configures it to work in an MSVC-compatible + # mode. Using it on Windows to avoid compiler compatibility issues for MSVC. + if os.name == "nt": + CMAKE_ARGS += " -T ClangCL" + # # Install executorch pip package. This also makes `flatc` available on the path. # The --extra-index-url may be necessary if pyproject.toml has a dependency on a From fbb0395110724717c42720582bb8804b752241e3 Mon Sep 17 00:00:00 2001 From: Dave Bort Date: Thu, 16 Jan 2025 17:59:38 -0800 Subject: [PATCH 29/40] Validate tensor sizes during method load Differential Revision: D68180029 Pull Request resolved: https://github.com/pytorch/executorch/pull/7663 --- runtime/executor/tensor_parser_portable.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/runtime/executor/tensor_parser_portable.cpp b/runtime/executor/tensor_parser_portable.cpp index 3f190060f7..79e4c4bd96 100644 --- a/runtime/executor/tensor_parser_portable.cpp +++ b/runtime/executor/tensor_parser_portable.cpp @@ -101,6 +101,19 @@ Result parseTensor( sizes = const_cast(serialized_sizes); dim_order = const_cast(serialized_dim_order); } + // Validate sizes before using them in case the PTE data is bad. We can't + // detect bad positive values, but we can reject negative values, which would + // otherwise panic in the TensorImpl ctor. dim_order_to_stride() will validate + // dim_order. 
+ for (int i = 0; i < dim; i++) { + ET_CHECK_OR_RETURN_ERROR( + sizes[i] >= 0, + InvalidProgram, + "Negative size[%d] %" PRId32, + i, + sizes[i]); + } + // We will remove strides from schema. // Allocating strides buffer here and populating it. // In subsequent diffs we can remove strides accessor, however this From 1a6b7a6f14c75d87b21c4fc517b0d7c0fe17f761 Mon Sep 17 00:00:00 2001 From: JP <46308822+zonglinpeng@users.noreply.github.com> Date: Thu, 16 Jan 2025 22:08:59 -0800 Subject: [PATCH 30/40] refactor test targets Differential Revision: D68194772 Pull Request resolved: https://github.com/pytorch/executorch/pull/7673 --- examples/cadence/operators/TARGETS | 25 ++----------- examples/cadence/operators/targets.bzl | 36 +++++++++++++++++++ ...nv1d_op.py => test_quantized_conv1d_op.py} | 4 ++- ...near_op.py => test_quantized_linear_op.py} | 0 4 files changed, 41 insertions(+), 24 deletions(-) create mode 100644 examples/cadence/operators/targets.bzl rename examples/cadence/operators/{quantized_conv1d_op.py => test_quantized_conv1d_op.py} (93%) rename examples/cadence/operators/{quantized_linear_op.py => test_quantized_linear_op.py} (100%) diff --git a/examples/cadence/operators/TARGETS b/examples/cadence/operators/TARGETS index 732f1ced09..67f2bab681 100644 --- a/examples/cadence/operators/TARGETS +++ b/examples/cadence/operators/TARGETS @@ -1,26 +1,5 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") +load("targets.bzl", "define_common_targets") oncall("odai_jarvis") - -python_unittest( - name = "test_add_op", - srcs = [ - "test_add_op.py", - ], - typing = True, - supports_static_listing = False, - deps = [ - "fbsource//third-party/pypi/parameterized:parameterized", - "//caffe2:torch", - "//executorch/backends/cadence/aot:ops_registrations", - "//executorch/backends/cadence/aot:export_example", - "//executorch/backends/cadence/aot:compiler", - ], -) +define_common_targets() diff --git a/examples/cadence/operators/targets.bzl b/examples/cadence/operators/targets.bzl new file mode 100644 index 0000000000..e1fbeb9fdf --- /dev/null +++ b/examples/cadence/operators/targets.bzl @@ -0,0 +1,36 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +TESTS_LIST = [ + "add_op", + "quantized_conv1d_op", + "quantized_linear_op", +] + +def define_common_targets(): + for op in TESTS_LIST: + _define_test_target(op) + + +def _define_test_target(test_name): + file_name = "test_{}".format(test_name) + python_unittest( + name = file_name, + srcs = [ + "{}.py".format(file_name), + ], + typing = True, + supports_static_listing = False, + deps = [ + "fbsource//third-party/pypi/parameterized:parameterized", + "fbcode//caffe2:torch", + "fbcode//executorch/backends/cadence/aot:ops_registrations", + "fbcode//executorch/backends/cadence/aot:export_example", + "fbcode//executorch/backends/cadence/aot:compiler", + ], + ) diff --git a/examples/cadence/operators/quantized_conv1d_op.py b/examples/cadence/operators/test_quantized_conv1d_op.py similarity index 93% rename from examples/cadence/operators/quantized_conv1d_op.py rename to 
examples/cadence/operators/test_quantized_conv1d_op.py index 3247cb690d..e2457077b2 100644 --- a/examples/cadence/operators/quantized_conv1d_op.py +++ b/examples/cadence/operators/test_quantized_conv1d_op.py @@ -8,6 +8,8 @@ import logging +from typing import cast, Sequence + import torch from executorch.backends.cadence.aot.ops_registrations import * # noqa @@ -53,6 +55,6 @@ def forward(self, x: torch.Tensor): model = QuantizedConv() model.eval() - example_inputs = (torch.randn(shape),) + example_inputs = (torch.randn(cast(Sequence[int], shape)),) export_model(model, example_inputs) diff --git a/examples/cadence/operators/quantized_linear_op.py b/examples/cadence/operators/test_quantized_linear_op.py similarity index 100% rename from examples/cadence/operators/quantized_linear_op.py rename to examples/cadence/operators/test_quantized_linear_op.py From dad73ca6240429e2f79d666547cd61c95c05c427 Mon Sep 17 00:00:00 2001 From: SaoirseARM <44364573+SaoirseARM@users.noreply.github.com> Date: Fri, 17 Jan 2025 08:57:38 +0000 Subject: [PATCH 31/40] Fix for multiple outputs in FVP tests (#7650) Fix for multiple outputs in corstone - Update to ensure all output nodes are consumed. - Update to ensure output quant scales are used. 
--- .../arm/test/misc/test_multiple_outputs.py | 47 ++++++++++- backends/arm/test/runner_utils.py | 79 ++++++++++--------- .../arm/test/tester/analyze_output_utils.py | 8 +- backends/arm/test/tester/arm_tester.py | 37 +++++---- 4 files changed, 114 insertions(+), 57 deletions(-) diff --git a/backends/arm/test/misc/test_multiple_outputs.py b/backends/arm/test/misc/test_multiple_outputs.py index 7762c7dc2f..ddddc94d27 100644 --- a/backends/arm/test/misc/test_multiple_outputs.py +++ b/backends/arm/test/misc/test_multiple_outputs.py @@ -6,9 +6,11 @@ import unittest +import pytest import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.exir.backend.compile_spec_schema import CompileSpec class TestMultipleOutputs(unittest.TestCase): @@ -51,3 +53,46 @@ def test_tosa_BI_pipeline(self): .to_executorch() .run_method_and_compare_outputs(inputs=inputs, qtol=1.0) ) + + def _test_ethosu_BI_pipeline( + self, + module: torch.nn.Module, + test_data: tuple[torch.Tensor], + compile_spec: CompileSpec, + ): + tester = ( + ArmTester( + module, + example_inputs=test_data, + compile_spec=compile_spec, + ) + .quantize() + .export() + .to_edge_transform_and_lower() + .to_executorch() + .serialize() + ) + if conftest.is_option_enabled("corstone_fvp"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) + + @pytest.mark.corstone_fvp + def test_u85_BI(self): + module = self.MultipleOutputsModule() + test_data = module.get_inputs() + self._test_ethosu_BI_pipeline( + module, + test_data, + common.get_u85_compile_spec(), + ) + + @pytest.mark.corstone_fvp + @conftest.expectedFailureOnFVP + # TODO MLETORCH-598 + def test_u55_BI(self): + module = self.MultipleOutputsModule() + test_data = module.get_inputs() + self._test_ethosu_BI_pipeline( + module, + test_data, + common.get_u55_compile_spec(), + ) diff --git 
a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index b206e5585b..3851e41b73 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -115,50 +115,53 @@ def _get_input_quantization_params( return quant_params -def _get_output_node(program: ExportedProgram) -> Node: +def _get_output_nodes(program: ExportedProgram) -> list[Node]: """ Get output node to this model. Args: - program (ExportedProgram): The program to get output node from. + program (ExportedProgram): The program to get the output nodes from. Returns: - The node that is the output of 'program'. + The nodes that are the outputs of the 'program'. """ - + output_nodes = [] for node in program.graph.nodes: if node.op == "output": - return node - raise RuntimeError("No output node found.") + for output in node.args[0]: + output_nodes.append(output) + if len(output_nodes) == 0: + raise RuntimeError("No output nodes found.") + else: + return output_nodes def _get_output_quantization_params( - program: ExportedProgram, output_node: Node -) -> Optional[QuantizationParams]: + output_nodes: list[Node], +) -> List[QuantizationParams]: """ Get output QuantizationParams from a program. Args: - program (ExportedProgram): The program to get output quantization parameters from. + output_nodes (list(Node)): A list of output nodes to get output quantization parameters from. Returns: QuantizationParams: The found quantization parameters. Raises: RuntimeError if no output quantization parameters are found. 
""" - - quant_params = None - for node in program.graph.nodes: - if ( - node.target == torch.ops.quantized_decomposed.dequantize_per_tensor.default - and node == output_node.args[0][0] - ): - quant_params = QuantizationParams( - node_name=node.args[0].name, - scale=node.args[1], - zp=node.args[2], - qmin=node.args[3], - qmax=node.args[4], - dtype=node.args[5], + quant_params = [] + for node in output_nodes: + if node.target == torch.ops.quantized_decomposed.dequantize_per_tensor.default: + quant_params.append( + QuantizationParams( + node_name=node.args[0].name, + scale=node.args[1], + zp=node.args[2], + qmin=node.args[3], + qmax=node.args[4], + dtype=node.args[5], + ) ) - break # break early, there's only one output node + if len(quant_params) == 0: + raise RuntimeError("No Quantization parameters not found in exported model.") return quant_params @@ -211,7 +214,7 @@ def __init__( self.input_names: list[str] = None self.output_name: str = None self.qp_input: list[QuantizationParams] = None - self.qp_output: QuantizationParams = None + self.qp_output: list[QuantizationParams] = None self.timeout = 480 self.target_board: str = None @@ -226,19 +229,17 @@ def init_run( ): self.input_names = _get_input_names(edge_program) - self.output_node = _get_output_node(exported_program) - self.output_name = self.output_node.name + self.output_nodes = _get_output_nodes(exported_program) + self.is_quantized = is_quantized self.target_board = target_board if is_quantized: self.qp_input = _get_input_quantization_params(exported_program) - self.qp_output = _get_output_quantization_params( - exported_program, self.output_node - ) + self.qp_output = _get_output_quantization_params(self.output_nodes) else: self.qp_input = [None] * len(self.input_names) - self.qp_output = None + self.qp_output = [None] * len(self.output_nodes) self._has_init_run = True @@ -265,7 +266,7 @@ def run_corstone( save_bytes(self.intermediate_path, data, False, input_name, quant_param) out_path = 
os.path.join(self.intermediate_path, "out") - out_path_with_suffix = out_path + "-0.bin" + input_paths = [] for name in self.input_names: input_paths.append( @@ -281,6 +282,7 @@ def run_corstone( ), f"Did not find build arm_executor_runner in path {elf_path}, run setup_testing.sh?" cmd_line = f"executor_runner -m {pte_path} -o {out_path}" + for input_path in input_paths: cmd_line += f" -i {input_path}" @@ -362,11 +364,14 @@ def run_corstone( raise RuntimeError( f"Corstone simulation failed:\ncmd: {command_args[self.target_board]}\n, log: \n {result_stdout}\n{result.stderr.decode()}" ) - - tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32) - output_shape = self.output_node.args[0][0].meta["val"].shape - tosa_ref_output = torch.from_numpy(tosa_ref_output).reshape(output_shape) - return tosa_ref_output + output_np = [] + for i, node in enumerate(self.output_nodes): + tosa_ref_output = np.fromfile( + os.path.join(self.intermediate_path, f"out-{i}.bin"), dtype=np.float32 + ) + output_shape = node.meta["val"].shape + output_np.append(torch.from_numpy(tosa_ref_output).reshape(output_shape)) + return tuple(output_np) def run_tosa_graph( self, graph: TosaGraph, inputs: list[np.ndarray] | list[torch.Tensor] diff --git a/backends/arm/test/tester/analyze_output_utils.py b/backends/arm/test/tester/analyze_output_utils.py index d70f86c4f2..477a96652f 100644 --- a/backends/arm/test/tester/analyze_output_utils.py +++ b/backends/arm/test/tester/analyze_output_utils.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -9,7 +9,7 @@ import torch from executorch.backends.arm.test.runner_utils import ( _get_input_quantization_params, - _get_output_node, + _get_output_nodes, _get_output_quantization_params, ) @@ -228,9 +228,9 @@ def dump_error_output( export_stage = tester.stages.get(tester.stage_name(Export), None) quantize_stage = tester.stages.get(tester.stage_name(Quantize), None) if export_stage is not None and quantize_stage is not None: - output_node = _get_output_node(export_stage.artifact) + output_nodes = _get_output_nodes(export_stage.artifact) qp_input = _get_input_quantization_params(export_stage.artifact) - qp_output = _get_output_quantization_params(export_stage.artifact, output_node) + qp_output = _get_output_quantization_params(output_nodes) logger.error(f"Input QuantArgs: {qp_input}") logger.error(f"Output QuantArgs: {qp_output}") diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index e5c700ec3c..5b2f9201fc 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -14,6 +14,7 @@ import serializer.tosa_serializer as ts import torch.fx +import torch.utils._pytree as pytree from executorch.backends.arm.arm_backend import get_intermediate_path from executorch.backends.arm.arm_partitioner import ArmPartitioner @@ -302,6 +303,7 @@ def run_method_and_compare_outputs( exported_program = self.stages[self.stage_name(tester.Export)].artifact edge_program = edge_stage.artifact.exported_program() + self.runner_util.init_run( exported_program, edge_program, @@ -309,14 +311,14 @@ def run_method_and_compare_outputs( target_board, ) - quantization_scale = None if is_quantized: reference_stage = self.stages[self.stage_name(tester.Quantize)] # bool output is quantized with none quantized output so allow # self.runner_util.qp_output to be none if self.runner_util.qp_output is not None: - quantization_scale = self.runner_util.qp_output.scale + quantization_scales = [qp.scale for qp in 
self.runner_util.qp_output] else: + quantization_scales = [None] * len(self.runner_util.output_nodes) reference_stage = self.stages[self.stage_name(InitialModel)] logger.info( @@ -334,21 +336,26 @@ def run_method_and_compare_outputs( input_shape_str = ", ".join([str(list(i)) for i in input_shapes]) logger.info(f"Run #{run_iteration}, input shapes: {input_shape_str}") - reference_output = reference_stage.run_artifact(reference_input) - if not isinstance(reference_output, tuple): - reference_output = (reference_output,) - test_output = test_stage.run_artifact(reference_input) - - self._compare_outputs( - reference_output, - test_output, - quantization_scale, - atol, - rtol, - qtol, - error_callbacks, + reference_outputs, _ = pytree.tree_flatten( + reference_stage.run_artifact(reference_input) + ) + test_outputs, _ = pytree.tree_flatten( + test_stage.run_artifact(reference_input) ) + for reference_output, test_output, quantization_scale in zip( + reference_outputs, test_outputs, quantization_scales + ): + self._compare_outputs( + reference_output, + test_output, + quantization_scale, + atol, + rtol, + qtol, + error_callbacks, + ) + return self def get_graph(self, stage: str | None = None) -> Graph: From cb45fb6ccb1a1b2dd170bc047617cc2e9ff592ab Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Fri, 17 Jan 2025 08:59:39 +0000 Subject: [PATCH 32/40] Fix uninitialized variable type-check in FuseQuantizedActivationPass (#7671) --- backends/arm/_passes/fuse_quantized_activation_pass.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/backends/arm/_passes/fuse_quantized_activation_pass.py b/backends/arm/_passes/fuse_quantized_activation_pass.py index 86836842bb..4eccea1a14 100644 --- a/backends/arm/_passes/fuse_quantized_activation_pass.py +++ b/backends/arm/_passes/fuse_quantized_activation_pass.py @@ -19,12 +19,13 @@ def _is_fuseable_quantized_activation(self, node: Node): is_fuseable = min_val == 0 is_quantized = len(node.users) == 
1 and next(iter(node.users)).target == q_op - if is_quantized: + if is_fuseable and is_quantized: quant_node = next(iter(node.users)) zp = quant_node.args[2] qmin = quant_node.args[3] - - return is_fuseable and is_quantized and zp == qmin + return zp == qmin + else: + return False def _is_fuseable_input(self, node: Node): return ( From ffc20208dae8f4900da11bfffb76f749e7514132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Per=20=C3=85strand?= Date: Fri, 17 Jan 2025 11:24:37 +0100 Subject: [PATCH 33/40] Remove unused functions for quantization handling (#7700) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove functions not used for searching/finding quantization information. Signed-off-by: Per Åstrand --- .../annotate_channels_last_dim_order_pass.py | 7 +- backends/arm/operators/__init__.py | 2 - backends/arm/operators/op_dequant.py | 35 --- backends/arm/operators/op_hardtanh.py | 7 +- backends/arm/operators/op_quant.py | 35 --- backends/arm/operators/op_relu.py | 8 +- backends/arm/process_node.py | 22 +- backends/arm/tosa_quant_utils.py | 270 +----------------- backends/arm/tosa_utils.py | 28 -- examples/arm/aot_arm_compiler.py | 6 +- 10 files changed, 21 insertions(+), 399 deletions(-) delete mode 100644 backends/arm/operators/op_dequant.py delete mode 100644 backends/arm/operators/op_quant.py diff --git a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py index 80c5f3c442..4aff46de67 100644 --- a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -15,7 +15,7 @@ get_node_arg, insert_q_dq_pair, ) -from executorch.backends.arm.tosa_quant_utils import dq_op, q_op, register_passable_op +from executorch.backends.arm.tosa_quant_utils import dq_op, q_op from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -43,9 +43,6 @@ def _transpose_impl(*args, **kwargs): return args[0] -register_passable_op(torch.ops.passthrough_to_tosa._transpose) - - class AnnotateChannelsLastDimOrder(ExportPass): """ Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index 157e5ec092..a21bde535e 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -13,7 +13,6 @@ op_bmm, op_cat, op_conv2d, - op_dequant, op_exp, op_full, op_get_item, @@ -24,7 +23,6 @@ op_min, op_mul, op_permute, - op_quant, op_reciprocal, op_relu, op_repeat, diff --git a/backends/arm/operators/op_dequant.py b/backends/arm/operators/op_dequant.py deleted file mode 100644 index 022f4e45ce..0000000000 --- a/backends/arm/operators/op_dequant.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2023-2024 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe -from typing import List - -import serializer.tosa_serializer as ts -import torch -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp - - -@register_node_visitor -class DequantVisitor(NodeVisitor): - target = "quantized_decomposed.dequantize_per_tensor.default" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - item_name = inputs[0].name - ## Simply add an identityOp - tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name]) diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py index bfbab55b92..c971b50b66 100644 --- a/backends/arm/operators/op_hardtanh.py +++ b/backends/arm/operators/op_hardtanh.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024 Arm Limited and/or its affiliates. +# Copyright 2023-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -19,7 +19,6 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg -from executorch.backends.arm.tosa_quant_utils import quantize_value from serializer.tosa_serializer import TosaOp @@ -44,8 +43,8 @@ def define_node( input_qparams = get_input_qparams(node) # pyre-ignore[16] qargs = input_qparams[0] # Convert to quantized representation - clamp_min_qs = quantize_value(inputs[1].number, qargs) - clamp_max_qs = quantize_value(inputs[2].number, qargs) + clamp_min_qs = qargs.quantize_value(inputs[1].number).item() + clamp_max_qs = qargs.quantize_value(inputs[2].number).item() # Set fp values to 0.0 since they are not used clamp_min_fp = 0.0 clamp_max_fp = 0.0 diff --git a/backends/arm/operators/op_quant.py b/backends/arm/operators/op_quant.py deleted file mode 100644 index fcf9372c11..0000000000 --- a/backends/arm/operators/op_quant.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2023-2024 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe -from typing import List - -import serializer.tosa_serializer as ts -import torch -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.tosa_mapping import TosaArg -from serializer.tosa_serializer import TosaOp - - -@register_node_visitor -class QuantVisitor(NodeVisitor): - target = "quantized_decomposed.quantize_per_tensor.default" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - item_name = inputs[0].name - ## Simply add an identityOp - tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name]) diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py index 4df13e71b7..b5ffa2aa70 100644 --- a/backends/arm/operators/op_relu.py +++ b/backends/arm/operators/op_relu.py @@ -1,11 +1,10 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
# pyre-unsafe -import executorch.backends.arm.tosa_quant_utils as tqutils import serializer.tosa_serializer as ts import torch.fx @@ -43,9 +42,8 @@ def define_node( clamp_max_qs = 0 if inputs[0].dtype == ts.DType.INT8: out_qargs = get_output_qparams(node) # pyre-ignore[16] - clamp_min_qs = tqutils.quantize_value(0, out_qargs[0]) - clamp_max_qs = tqutils.quantize_value(float("inf"), out_qargs[0]) - + clamp_min_qs = out_qargs[0].quantize_value(0).item() + clamp_max_qs = out_qargs[0].quantize_value(float("inf")).item() else: clamp_min_fp = 0 clamp_max_fp = float("inf") diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py index 9ab9c49044..36a1567df9 100644 --- a/backends/arm/process_node.py +++ b/backends/arm/process_node.py @@ -12,12 +12,7 @@ import torch import torch.fx from executorch.backends.arm.operators.node_visitor import NodeVisitor -from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg -from executorch.backends.arm.tosa_quant_utils import ( - dq_op, - get_quantized_node_output_dtype, - is_node_quantized, -) +from executorch.backends.arm.tosa_mapping import TosaArg from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.arm.tosa_utils import getNodeArgs, tosa_shape from torch.export.exported_program import ExportedProgram @@ -35,15 +30,8 @@ def process_call_function( # Convert output (this node itself) output = TosaArg(node) - is_dq_node = node.target == dq_op - if is_dq_node: - output_dtype = ts.DType.INT8 - else: - output_dtype = output.dtype tosa_graph.currRegion.currBasicBlock.addTensor( - output.name, - tosa_shape(output.shape, output.dim_order), - output_dtype, + output.name, tosa_shape(output.shape, output.dim_order), output.dtype ) # Visiting each Node @@ -79,11 +67,7 @@ def process_inputs( tensor = ts.TosaSerializerTensor( inputs[0].name, tosa_shape(input_shape, input_dim_order), - ( - map_dtype(get_quantized_node_output_dtype(node)) - if is_node_quantized(node) - else 
inputs[0].dtype - ), + inputs[0].dtype, data=None, placeholderFilename=inputs[0].name + ".npy", ) diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index dff7b12cdd..9869a08c0b 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024 Arm Limited and/or its affiliates. +# Copyright 2023-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -8,9 +8,7 @@ # Utiliy functions for TOSA quantized lowerings import math -from typing import Callable, cast, NamedTuple, Sequence - -import numpy as np +from typing import cast, NamedTuple import serializer.tosa_serializer as ts import torch.fx @@ -24,22 +22,6 @@ q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default dq_q_ops = (q_op, dq_op) -passable_ops = [ - exir_ops.edge.aten.view_copy.default, - exir_ops.edge.aten.permute_copy.default, - exir_ops.edge.aten.squeeze_copy.dims, - exir_ops.edge.aten.unsqueeze_copy.default, - exir_ops.edge.aten.split_with_sizes_copy.default, - exir_ops.edge.aten.repeat.default, - exir_ops.edge.aten.clone.default, - exir_ops.edge.aten.slice_copy.Tensor, - exir_ops.edge.aten.cat.default, -] - - -def register_passable_op(op): - """We need to be able to add custom ops such as tosa_transpose to the passable_op list after they have been created""" - passable_ops.append(op) def insert_rescale_ops_to_int32( @@ -53,8 +35,7 @@ def insert_rescale_ops_to_int32( This functions is used in serialization to TOSA for target ops that are handled by the DQ/D folding pass, which stores the quantization parameters - in the node meta dict as opposed to 'rescale_nodes_to_int32' which search - the graph upstream for DQ nodes. + in the node meta dict. 
""" # pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.' @@ -100,13 +81,12 @@ def insert_rescale_op_to_int8( Parameters: node: The original node that is being handled by the rescales. last_tensor:the tosa tensor to rescale back. - scale: the scaling factor used to rescale to int32, from the function 'rescale_nodes_to_int32' + scale: the scaling factor used to rescale to int32, from the function 'insert_rescale_op_to_int32' tosa_graph: the tosa_graph to manipulate. This functions is used in serialization to TOSA for target ops that are handled by the DQ/D folding pass, which stores the quantization parameters - in the node meta dict as opposed to 'rescale_node_back_to_int8' which search - the graph downstream for Q nodes. + in the node meta dict. """ # pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.' from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( @@ -148,17 +128,6 @@ def quantize_value(self, x): def dequantize_value(self, qx: torch.Tensor) -> torch.Tensor: return (qx - self.zp) * self.scale - def __eq__(self, other): - if isinstance(other, QuantArgs): - return ( - self.scale == other.scale - and self.zp == other.zp - and self.qmin == other.qmin - and self.qmax == other.qmax - and self.dtype == other.dtype - ) - return False - @classmethod def from_operator(cls, op, args): if op in dq_q_ops: @@ -174,172 +143,6 @@ def from_operator(cls, op, args): raise NotImplementedError -def quantize_value(x, qargs: QuantArgs, dtype=np.int8): - return np.clip( - np.round(x / qargs.scale) + qargs.zp, - qargs.qmin, - qargs.qmax, - ).astype(dtype) - - -def dequantize_value(qx, qargs: QuantArgs): - return (np.int64(qx) - qargs.zp) * qargs.scale - - -def qargs_from_qnode(node: torch.fx.Node): - assert node.target in dq_q_ops, f"Op {node} is not a quant node." 
- - return QuantArgs.from_operator(node.target, node.args) - - -def get_neighbour_quant_args( - node: torch.fx.Node, -) -> tuple[list[QuantArgs], list[QuantArgs]]: - user_q_args = [] - - for user in node.users: - q_args = search_quant_arg_downstream(user) - if q_args: - user_q_args.append(q_args) - - input_q_nodes = [] - for input_node in node.all_input_nodes: - q_args = search_quant_arg_upstream(input_node) - if q_args: - input_q_nodes.append(q_args) - return user_q_args, input_q_nodes - - -def all_q_args_equal(q_arg_list: list[QuantArgs]) -> bool: - first_q_arg = q_arg_list[0] - for q_arg in q_arg_list: - if q_arg != first_q_arg: - return False - return True - - -def is_node_quantized(node: torch.fx.Node) -> bool: - if node.target in dq_q_ops: - return True - - user_q_args, input_q_args = get_neighbour_quant_args(node) - - # If we did not find any neighbouring quant nodes, we are not quantized. - if len(input_q_args) == 0 and len(user_q_args) == 0: - return False - - if node.target in passable_ops: - assert all_q_args_equal( - user_q_args + input_q_args - ), f"Node {node} needs same quantization parameters on all inputs and outputs." - - return True - - -def search_quant_arg_downstream(node: torch.fx.Node) -> QuantArgs | None: - """ - Iterates downward in the graph passing through 'passable_ops' to find and return a quantization node, - starting with 'node'. - If a passable node with multiple consumers is encountered, - find QuantArgs for all consumers and assert that they are equal. - If a node not in passable_ops is encountered, return None. - If a node without consumers is encountered, return None. 
- """ - if node.target in dq_q_ops: - return qargs_from_qnode(node) - if node.target not in passable_ops: - return None - consumer_nodes = list(node.users) - if len(consumer_nodes) == 0: - return None - elif len(consumer_nodes) == 1: - return search_quant_arg_downstream(consumer_nodes[0]) - else: - consumer_qargs: list[QuantArgs] = [] - for input in consumer_nodes: - quant_args = search_quant_arg_downstream(input) - if quant_args: - consumer_qargs.append(quant_args) - if len(consumer_qargs) == 0: - return None - assert all_q_args_equal( - consumer_qargs - ), f"Encountered a op, {node}, in passable_ops with different QuantArgs for different consumers." - return consumer_qargs[0] - - -def get_quant_arg_downstream(node: torch.fx.Node) -> QuantArgs: - """Calls search_quant_arg_downstream and asserts that QuantArgs are found, - meaning return value can't be None. - """ - qargs = search_quant_arg_downstream(node) - assert qargs, f"Did not find QuantArgs downstream for node {node}" - return qargs - - -def search_quant_arg_upstream(node: torch.fx.Node) -> QuantArgs | None: - """ - Iterates upward in the graph passing through 'passable_ops' to find and return a quantization node, - starting with 'node'. - If a passable node with multiple inputs is encountered, - find QuantArgs for all inputs and assert that they are equal. - If a node not in passable_ops is encountered, return None. - If a node without inputs is encountered, return None. 
- """ - - if node.target in dq_q_ops: - return qargs_from_qnode(node) - if node.target not in passable_ops: - return None - input_nodes = list(node.all_input_nodes) - if len(input_nodes) == 0: - return None - elif len(input_nodes) == 1: - return search_quant_arg_upstream(input_nodes[0]) - else: - input_qargs: list[QuantArgs] = [] - for input in input_nodes: - quant_args = search_quant_arg_upstream(input) - if quant_args: - input_qargs.append(quant_args) - if len(input_qargs) == 0: - return None - assert all_q_args_equal( - input_qargs - ), f"Encountered a op, {node}, in passable_ops with different QuantArgs for different inputs." - return input_qargs[0] - - -def get_quant_arg_upstream(node: torch.fx.Node) -> QuantArgs: - """Calls search_quant_arg_upstream and asserts that QuantArgs are found, - meaning return value can't be None. - """ - qargs = search_quant_arg_upstream(node) - assert qargs, f"Did not find QuantArgs upstream for node {node}" - return qargs - - -def get_quantized_node_output_dtype(node: torch.fx.Node) -> torch.dtype: - if isinstance(node.target, Callable) and "output_qparams" in node.meta.keys(): - # Check if the node has had it's quantization parameters folded - # and retrieve the dtype from the meta dict in that case. 
- assert len(node.meta["output_qparams"]) == 1 - qargs = cast(QuantArgs, node.meta["output_qparams"][0]) - return qargs.dtype - - if node.target in dq_q_ops: - return cast(torch.dtype, node.args[5]) - - # if not a tosa node, nor a q/dq op, walk the graph until we find a q op - user_q_args, input_q_args = get_neighbour_quant_args(node) - if len(user_q_args) > 0: - return user_q_args[0].dtype - elif node.target in passable_ops and len(input_q_args) > 0: - return input_q_args[0].dtype - else: - raise RuntimeError("No quantized node found in graph") - - # Check if scale32 mode is used for given output element type def is_scale32(type): return type == ts.DType.INT8 @@ -476,69 +279,6 @@ def build_rescale_from_int32( return -def rescale_nodes_to_int32( - nodes: Sequence[Node], tosa_graph: ts.TosaSerializer -) -> tuple[list[TosaSerializerTensor], float]: - """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'. - The scales are adjusted using the smallest scale of all 'nodes'. - - Returns a list of the rescaled nodes and the scale factor used, - needed by rescale_node_back_to_int8. 
- """ - - tensors = [TosaArg(node) for node in nodes] - - # Reshape tensor according to tosa dim order - for tensor in tensors: - dim_order = tensor.dim_order - tensor.shape = [tensor.shape[i] for i in dim_order] - - qargs = [get_quant_arg_upstream(node) for node in nodes] - - # Scale the int8 quantized input to a common scale in the integer - # domain - min_scale = min([qarg.scale for qarg in qargs]) - scales = [qarg.scale / min_scale for qarg in qargs] - - rescaled_nodes: list[TosaSerializerTensor] = [] - for tensor, qarg, scale in zip(tensors, qargs, scales): - rescaled_nodes.append( - build_rescale_to_int32( - tosa_graph, - tensor, - qarg.zp, - scale, - ) - ) - return rescaled_nodes, min_scale - - -def rescale_node_back_to_int8( - node: Node, - last_tensor: TosaSerializerTensor, - scale: float, - tosa_graph: ts.TosaSerializer, -): - """Rescales the node back to int8, adding a suitable RESCALE op to 'tosa_graph'. - Parameters: - node: The original node that is being handled by the rescales. - last_tensor:the tosa tensor to rescale back. - scale: the scaling factor used to rescale to int32, from the function 'rescale_nodes_to_int32' - tosa_graph: the tosa_graph to manipulate. - """ - qargs_out = get_quant_arg_downstream(list(node.users)[0]) - output_rescale_scale = scale / qargs_out.scale - - # Rescale Back to INT8 - build_rescale_from_int32( - tosa_graph, - last_tensor.name, - node.name, - qargs_out.zp, - output_rescale_scale, - ) - - """ Creates a TOSA rescale op based on conv2d parameters. 
""" diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index c03e0ef0bb..9fefdbb3ff 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -115,10 +115,6 @@ def getNodeArgs(node: Node) -> list[TosaArg]: return [TosaArg(arg) for arg in node.args] -def get_input_tensor(node: Node) -> TosaArg: - return TosaArg(node.args[0]) - - def get_output_node(node: Node) -> Node: return list(node.users)[0] @@ -146,30 +142,6 @@ def is_consumer_node_depthwise_conv2d(node): return False -def get_two_inputs(node: Node, check: bool = False) -> tuple[Node, Node]: - """Returns two input nodes to 'node' in order. If 'node' only has one input, - it is returned twice. - - Fails if there are no input nodes. - Fails if there are >2 input nodes and 'check' is True, - """ - - num_inputs = len(node.all_input_nodes) - assert num_inputs > 0, f"Node '{node.name}' requires >0 input, got {num_inputs}." - - input1 = node.all_input_nodes[0] - if num_inputs == 1: - input2 = node.all_input_nodes[0] - else: - input2 = node.all_input_nodes[1] - if check: - assert ( - num_inputs <= 2 - ), f"Node '{node.name}' requires <=2 inputs, got {num_inputs}." 
- - return input1, input2 - - def tosa_shape(shape, dim_order): return tuple([shape[dim] for dim in dim_order]) diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 9563be93aa..a49436193b 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -264,7 +264,11 @@ def get_compile_spec( ) -> list[CompileSpec]: spec_builder = None if target == "TOSA": - spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI") + spec_builder = ( + ArmCompileSpecBuilder() + .tosa_compile_spec("TOSA-0.80+BI") + .set_quantize_io(True) + ) elif "ethos-u55" in target: spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec( target, From eaad7ff1ece5524b8892be9a3c40a3636ec2b64f Mon Sep 17 00:00:00 2001 From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com> Date: Fri, 17 Jan 2025 11:36:03 +0100 Subject: [PATCH 34/40] Revert "Remove unused functions for quantization handling" (#7724) Revert "Remove unused functions for quantization handling (#7700)" This reverts commit ffc20208dae8f4900da11bfffb76f749e7514132. 
--- .../annotate_channels_last_dim_order_pass.py | 7 +- backends/arm/operators/__init__.py | 2 + backends/arm/operators/op_dequant.py | 35 +++ backends/arm/operators/op_hardtanh.py | 7 +- backends/arm/operators/op_quant.py | 35 +++ backends/arm/operators/op_relu.py | 8 +- backends/arm/process_node.py | 22 +- backends/arm/tosa_quant_utils.py | 270 +++++++++++++++++- backends/arm/tosa_utils.py | 28 ++ examples/arm/aot_arm_compiler.py | 6 +- 10 files changed, 399 insertions(+), 21 deletions(-) create mode 100644 backends/arm/operators/op_dequant.py create mode 100644 backends/arm/operators/op_quant.py diff --git a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py index 4aff46de67..80c5f3c442 100644 --- a/backends/arm/_passes/annotate_channels_last_dim_order_pass.py +++ b/backends/arm/_passes/annotate_channels_last_dim_order_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024-2025 Arm Limited and/or its affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -15,7 +15,7 @@ get_node_arg, insert_q_dq_pair, ) -from executorch.backends.arm.tosa_quant_utils import dq_op, q_op +from executorch.backends.arm.tosa_quant_utils import dq_op, q_op, register_passable_op from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -43,6 +43,9 @@ def _transpose_impl(*args, **kwargs): return args[0] +register_passable_op(torch.ops.passthrough_to_tosa._transpose) + + class AnnotateChannelsLastDimOrder(ExportPass): """ Annotates each node with a tosa_dim_order. 
tosa_dim_order can be seen as a channels-last dim-order diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index a21bde535e..157e5ec092 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -13,6 +13,7 @@ op_bmm, op_cat, op_conv2d, + op_dequant, op_exp, op_full, op_get_item, @@ -23,6 +24,7 @@ op_min, op_mul, op_permute, + op_quant, op_reciprocal, op_relu, op_repeat, diff --git a/backends/arm/operators/op_dequant.py b/backends/arm/operators/op_dequant.py new file mode 100644 index 0000000000..022f4e45ce --- /dev/null +++ b/backends/arm/operators/op_dequant.py @@ -0,0 +1,35 @@ +# Copyright 2023-2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts +import torch +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class DequantVisitor(NodeVisitor): + target = "quantized_decomposed.dequantize_per_tensor.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + item_name = inputs[0].name + ## Simply add an identityOp + tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name]) diff --git a/backends/arm/operators/op_hardtanh.py b/backends/arm/operators/op_hardtanh.py index c971b50b66..bfbab55b92 100644 --- a/backends/arm/operators/op_hardtanh.py +++ b/backends/arm/operators/op_hardtanh.py @@ -1,4 +1,4 @@ -# Copyright 2023-2025 Arm Limited and/or its affiliates. +# Copyright 2023-2024 Arm Limited and/or its affiliates. 
# # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -19,6 +19,7 @@ ) from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_quant_utils import quantize_value from serializer.tosa_serializer import TosaOp @@ -43,8 +44,8 @@ def define_node( input_qparams = get_input_qparams(node) # pyre-ignore[16] qargs = input_qparams[0] # Convert to quantized representation - clamp_min_qs = qargs.quantize_value(inputs[1].number).item() - clamp_max_qs = qargs.quantize_value(inputs[2].number).item() + clamp_min_qs = quantize_value(inputs[1].number, qargs) + clamp_max_qs = quantize_value(inputs[2].number, qargs) # Set fp values to 0.0 since they are not used clamp_min_fp = 0.0 clamp_max_fp = 0.0 diff --git a/backends/arm/operators/op_quant.py b/backends/arm/operators/op_quant.py new file mode 100644 index 0000000000..fcf9372c11 --- /dev/null +++ b/backends/arm/operators/op_quant.py @@ -0,0 +1,35 @@ +# Copyright 2023-2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts +import torch +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg +from serializer.tosa_serializer import TosaOp + + +@register_node_visitor +class QuantVisitor(NodeVisitor): + target = "quantized_decomposed.quantize_per_tensor.default" + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + item_name = inputs[0].name + ## Simply add an identityOp + tosa_graph.addOperator(TosaOp.Op().IDENTITY, [item_name], [output.name]) diff --git a/backends/arm/operators/op_relu.py b/backends/arm/operators/op_relu.py index b5ffa2aa70..4df13e71b7 100644 --- a/backends/arm/operators/op_relu.py +++ b/backends/arm/operators/op_relu.py @@ -1,10 +1,11 @@ -# Copyright 2024-2025 Arm Limited and/or its affiliates. +# Copyright 2024 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
# pyre-unsafe +import executorch.backends.arm.tosa_quant_utils as tqutils import serializer.tosa_serializer as ts import torch.fx @@ -42,8 +43,9 @@ def define_node( clamp_max_qs = 0 if inputs[0].dtype == ts.DType.INT8: out_qargs = get_output_qparams(node) # pyre-ignore[16] - clamp_min_qs = out_qargs[0].quantize_value(0).item() - clamp_max_qs = out_qargs[0].quantize_value(float("inf")).item() + clamp_min_qs = tqutils.quantize_value(0, out_qargs[0]) + clamp_max_qs = tqutils.quantize_value(float("inf"), out_qargs[0]) + else: clamp_min_fp = 0 clamp_max_fp = float("inf") diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py index 36a1567df9..9ab9c49044 100644 --- a/backends/arm/process_node.py +++ b/backends/arm/process_node.py @@ -12,7 +12,12 @@ import torch import torch.fx from executorch.backends.arm.operators.node_visitor import NodeVisitor -from executorch.backends.arm.tosa_mapping import TosaArg +from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg +from executorch.backends.arm.tosa_quant_utils import ( + dq_op, + get_quantized_node_output_dtype, + is_node_quantized, +) from executorch.backends.arm.tosa_specification import TosaSpecification from executorch.backends.arm.tosa_utils import getNodeArgs, tosa_shape from torch.export.exported_program import ExportedProgram @@ -30,8 +35,15 @@ def process_call_function( # Convert output (this node itself) output = TosaArg(node) + is_dq_node = node.target == dq_op + if is_dq_node: + output_dtype = ts.DType.INT8 + else: + output_dtype = output.dtype tosa_graph.currRegion.currBasicBlock.addTensor( - output.name, tosa_shape(output.shape, output.dim_order), output.dtype + output.name, + tosa_shape(output.shape, output.dim_order), + output_dtype, ) # Visiting each Node @@ -67,7 +79,11 @@ def process_inputs( tensor = ts.TosaSerializerTensor( inputs[0].name, tosa_shape(input_shape, input_dim_order), - inputs[0].dtype, + ( + map_dtype(get_quantized_node_output_dtype(node)) + if 
is_node_quantized(node) + else inputs[0].dtype + ), data=None, placeholderFilename=inputs[0].name + ".npy", ) diff --git a/backends/arm/tosa_quant_utils.py b/backends/arm/tosa_quant_utils.py index 9869a08c0b..dff7b12cdd 100644 --- a/backends/arm/tosa_quant_utils.py +++ b/backends/arm/tosa_quant_utils.py @@ -1,4 +1,4 @@ -# Copyright 2023-2025 Arm Limited and/or its affiliates. +# Copyright 2023-2024 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -8,7 +8,9 @@ # Utiliy functions for TOSA quantized lowerings import math -from typing import cast, NamedTuple +from typing import Callable, cast, NamedTuple, Sequence + +import numpy as np import serializer.tosa_serializer as ts import torch.fx @@ -22,6 +24,22 @@ q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default dq_q_ops = (q_op, dq_op) +passable_ops = [ + exir_ops.edge.aten.view_copy.default, + exir_ops.edge.aten.permute_copy.default, + exir_ops.edge.aten.squeeze_copy.dims, + exir_ops.edge.aten.unsqueeze_copy.default, + exir_ops.edge.aten.split_with_sizes_copy.default, + exir_ops.edge.aten.repeat.default, + exir_ops.edge.aten.clone.default, + exir_ops.edge.aten.slice_copy.Tensor, + exir_ops.edge.aten.cat.default, +] + + +def register_passable_op(op): + """We need to be able to add custom ops such as tosa_transpose to the passable_op list after they have been created""" + passable_ops.append(op) def insert_rescale_ops_to_int32( @@ -35,7 +53,8 @@ def insert_rescale_ops_to_int32( This functions is used in serialization to TOSA for target ops that are handled by the DQ/D folding pass, which stores the quantization parameters - in the node meta dict. + in the node meta dict as opposed to 'rescale_nodes_to_int32' which search + the graph upstream for DQ nodes. 
""" # pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.' @@ -81,12 +100,13 @@ def insert_rescale_op_to_int8( Parameters: node: The original node that is being handled by the rescales. last_tensor:the tosa tensor to rescale back. - scale: the scaling factor used to rescale to int32, from the function 'insert_rescale_op_to_int32' + scale: the scaling factor used to rescale to int32, from the function 'rescale_nodes_to_int32' tosa_graph: the tosa_graph to manipulate. This functions is used in serialization to TOSA for target ops that are handled by the DQ/D folding pass, which stores the quantization parameters - in the node meta dict. + in the node meta dict as opposed to 'rescale_node_back_to_int8' which search + the graph downstream for Q nodes. """ # pyre-fixme[21]: 'Could not find a module corresponding to import `executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass`.' from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( @@ -128,6 +148,17 @@ def quantize_value(self, x): def dequantize_value(self, qx: torch.Tensor) -> torch.Tensor: return (qx - self.zp) * self.scale + def __eq__(self, other): + if isinstance(other, QuantArgs): + return ( + self.scale == other.scale + and self.zp == other.zp + and self.qmin == other.qmin + and self.qmax == other.qmax + and self.dtype == other.dtype + ) + return False + @classmethod def from_operator(cls, op, args): if op in dq_q_ops: @@ -143,6 +174,172 @@ def from_operator(cls, op, args): raise NotImplementedError +def quantize_value(x, qargs: QuantArgs, dtype=np.int8): + return np.clip( + np.round(x / qargs.scale) + qargs.zp, + qargs.qmin, + qargs.qmax, + ).astype(dtype) + + +def dequantize_value(qx, qargs: QuantArgs): + return (np.int64(qx) - qargs.zp) * qargs.scale + + +def qargs_from_qnode(node: torch.fx.Node): + assert node.target in dq_q_ops, f"Op {node} is not a quant node." 
+ + return QuantArgs.from_operator(node.target, node.args) + + +def get_neighbour_quant_args( + node: torch.fx.Node, +) -> tuple[list[QuantArgs], list[QuantArgs]]: + user_q_args = [] + + for user in node.users: + q_args = search_quant_arg_downstream(user) + if q_args: + user_q_args.append(q_args) + + input_q_nodes = [] + for input_node in node.all_input_nodes: + q_args = search_quant_arg_upstream(input_node) + if q_args: + input_q_nodes.append(q_args) + return user_q_args, input_q_nodes + + +def all_q_args_equal(q_arg_list: list[QuantArgs]) -> bool: + first_q_arg = q_arg_list[0] + for q_arg in q_arg_list: + if q_arg != first_q_arg: + return False + return True + + +def is_node_quantized(node: torch.fx.Node) -> bool: + if node.target in dq_q_ops: + return True + + user_q_args, input_q_args = get_neighbour_quant_args(node) + + # If we did not find any neighbouring quant nodes, we are not quantized. + if len(input_q_args) == 0 and len(user_q_args) == 0: + return False + + if node.target in passable_ops: + assert all_q_args_equal( + user_q_args + input_q_args + ), f"Node {node} needs same quantization parameters on all inputs and outputs." + + return True + + +def search_quant_arg_downstream(node: torch.fx.Node) -> QuantArgs | None: + """ + Iterates downward in the graph passing through 'passable_ops' to find and return a quantization node, + starting with 'node'. + If a passable node with multiple consumers is encountered, + find QuantArgs for all consumers and assert that they are equal. + If a node not in passable_ops is encountered, return None. + If a node without consumers is encountered, return None. 
+ """ + if node.target in dq_q_ops: + return qargs_from_qnode(node) + if node.target not in passable_ops: + return None + consumer_nodes = list(node.users) + if len(consumer_nodes) == 0: + return None + elif len(consumer_nodes) == 1: + return search_quant_arg_downstream(consumer_nodes[0]) + else: + consumer_qargs: list[QuantArgs] = [] + for input in consumer_nodes: + quant_args = search_quant_arg_downstream(input) + if quant_args: + consumer_qargs.append(quant_args) + if len(consumer_qargs) == 0: + return None + assert all_q_args_equal( + consumer_qargs + ), f"Encountered a op, {node}, in passable_ops with different QuantArgs for different consumers." + return consumer_qargs[0] + + +def get_quant_arg_downstream(node: torch.fx.Node) -> QuantArgs: + """Calls search_quant_arg_downstream and asserts that QuantArgs are found, + meaning return value can't be None. + """ + qargs = search_quant_arg_downstream(node) + assert qargs, f"Did not find QuantArgs downstream for node {node}" + return qargs + + +def search_quant_arg_upstream(node: torch.fx.Node) -> QuantArgs | None: + """ + Iterates upward in the graph passing through 'passable_ops' to find and return a quantization node, + starting with 'node'. + If a passable node with multiple inputs is encountered, + find QuantArgs for all inputs and assert that they are equal. + If a node not in passable_ops is encountered, return None. + If a node without inputs is encountered, return None. 
+ """ + + if node.target in dq_q_ops: + return qargs_from_qnode(node) + if node.target not in passable_ops: + return None + input_nodes = list(node.all_input_nodes) + if len(input_nodes) == 0: + return None + elif len(input_nodes) == 1: + return search_quant_arg_upstream(input_nodes[0]) + else: + input_qargs: list[QuantArgs] = [] + for input in input_nodes: + quant_args = search_quant_arg_upstream(input) + if quant_args: + input_qargs.append(quant_args) + if len(input_qargs) == 0: + return None + assert all_q_args_equal( + input_qargs + ), f"Encountered a op, {node}, in passable_ops with different QuantArgs for different inputs." + return input_qargs[0] + + +def get_quant_arg_upstream(node: torch.fx.Node) -> QuantArgs: + """Calls search_quant_arg_upstream and asserts that QuantArgs are found, + meaning return value can't be None. + """ + qargs = search_quant_arg_upstream(node) + assert qargs, f"Did not find QuantArgs upstream for node {node}" + return qargs + + +def get_quantized_node_output_dtype(node: torch.fx.Node) -> torch.dtype: + if isinstance(node.target, Callable) and "output_qparams" in node.meta.keys(): + # Check if the node has had it's quantization parameters folded + # and retrieve the dtype from the meta dict in that case. 
+ assert len(node.meta["output_qparams"]) == 1 + qargs = cast(QuantArgs, node.meta["output_qparams"][0]) + return qargs.dtype + + if node.target in dq_q_ops: + return cast(torch.dtype, node.args[5]) + + # if not a tosa node, nor a q/dq op, walk the graph until we find a q op + user_q_args, input_q_args = get_neighbour_quant_args(node) + if len(user_q_args) > 0: + return user_q_args[0].dtype + elif node.target in passable_ops and len(input_q_args) > 0: + return input_q_args[0].dtype + else: + raise RuntimeError("No quantized node found in graph") + + # Check if scale32 mode is used for given output element type def is_scale32(type): return type == ts.DType.INT8 @@ -279,6 +476,69 @@ def build_rescale_from_int32( return +def rescale_nodes_to_int32( + nodes: Sequence[Node], tosa_graph: ts.TosaSerializer +) -> tuple[list[TosaSerializerTensor], float]: + """Rescales all 'nodes' to int32, adding suitable RESCALE ops to 'tosa_graph'. + The scales are adjusted using the smallest scale of all 'nodes'. + + Returns a list of the rescaled nodes and the scale factor used, + needed by rescale_node_back_to_int8. 
+ """ + + tensors = [TosaArg(node) for node in nodes] + + # Reshape tensor according to tosa dim order + for tensor in tensors: + dim_order = tensor.dim_order + tensor.shape = [tensor.shape[i] for i in dim_order] + + qargs = [get_quant_arg_upstream(node) for node in nodes] + + # Scale the int8 quantized input to a common scale in the integer + # domain + min_scale = min([qarg.scale for qarg in qargs]) + scales = [qarg.scale / min_scale for qarg in qargs] + + rescaled_nodes: list[TosaSerializerTensor] = [] + for tensor, qarg, scale in zip(tensors, qargs, scales): + rescaled_nodes.append( + build_rescale_to_int32( + tosa_graph, + tensor, + qarg.zp, + scale, + ) + ) + return rescaled_nodes, min_scale + + +def rescale_node_back_to_int8( + node: Node, + last_tensor: TosaSerializerTensor, + scale: float, + tosa_graph: ts.TosaSerializer, +): + """Rescales the node back to int8, adding a suitable RESCALE op to 'tosa_graph'. + Parameters: + node: The original node that is being handled by the rescales. + last_tensor:the tosa tensor to rescale back. + scale: the scaling factor used to rescale to int32, from the function 'rescale_nodes_to_int32' + tosa_graph: the tosa_graph to manipulate. + """ + qargs_out = get_quant_arg_downstream(list(node.users)[0]) + output_rescale_scale = scale / qargs_out.scale + + # Rescale Back to INT8 + build_rescale_from_int32( + tosa_graph, + last_tensor.name, + node.name, + qargs_out.zp, + output_rescale_scale, + ) + + """ Creates a TOSA rescale op based on conv2d parameters. 
""" diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py index 9fefdbb3ff..c03e0ef0bb 100644 --- a/backends/arm/tosa_utils.py +++ b/backends/arm/tosa_utils.py @@ -115,6 +115,10 @@ def getNodeArgs(node: Node) -> list[TosaArg]: return [TosaArg(arg) for arg in node.args] +def get_input_tensor(node: Node) -> TosaArg: + return TosaArg(node.args[0]) + + def get_output_node(node: Node) -> Node: return list(node.users)[0] @@ -142,6 +146,30 @@ def is_consumer_node_depthwise_conv2d(node): return False +def get_two_inputs(node: Node, check: bool = False) -> tuple[Node, Node]: + """Returns two input nodes to 'node' in order. If 'node' only has one input, + it is returned twice. + + Fails if there are no input nodes. + Fails if there are >2 input nodes and 'check' is True, + """ + + num_inputs = len(node.all_input_nodes) + assert num_inputs > 0, f"Node '{node.name}' requires >0 input, got {num_inputs}." + + input1 = node.all_input_nodes[0] + if num_inputs == 1: + input2 = node.all_input_nodes[0] + else: + input2 = node.all_input_nodes[1] + if check: + assert ( + num_inputs <= 2 + ), f"Node '{node.name}' requires <=2 inputs, got {num_inputs}." 
+ + return input1, input2 + + def tosa_shape(shape, dim_order): return tuple([shape[dim] for dim in dim_order]) diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index a49436193b..9563be93aa 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -264,11 +264,7 @@ def get_compile_spec( ) -> list[CompileSpec]: spec_builder = None if target == "TOSA": - spec_builder = ( - ArmCompileSpecBuilder() - .tosa_compile_spec("TOSA-0.80+BI") - .set_quantize_io(True) - ) + spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI") elif "ethos-u55" in target: spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec( target, From 5b9ab56657dabda161e866d4a574172f974b20c8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 17 Jan 2025 09:16:12 -0800 Subject: [PATCH 35/40] install_requirements.py: reorganize requirements (#7705) Duplicate requirements with the pyproject.toml > /dev/null One unique devel reqiurement > requirements-dev.txt Examples requirements > requirements-examples.txt Nightlies stayed in the script. Rationale: be as "normal" a Python project as seemed possible. Test Plan: install_requirements.sh in a clean venv succeeded --- install_requirements.py | 25 +++++-------------------- requirements-examples.txt | 5 +++++ 2 files changed, 10 insertions(+), 20 deletions(-) create mode 100644 requirements-examples.txt diff --git a/install_requirements.py b/install_requirements.py index 409460ca10..52ba89edd7 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -104,34 +104,15 @@ def install_requirements(use_pytorch_nightly): if use_pytorch_nightly else "torchvision" ), # For testing. - "typing-extensions", ] - # pip packages needed to run examples. 
- # TODO: Make each example publish its own requirements.txt EXAMPLES_REQUIREMENTS = [ - "timm==1.0.7", f"torchaudio==2.6.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchaudio", - "torchsr==1.0.4", - "transformers==4.47.1", - ] - - # pip packages needed for development. - DEVEL_REQUIREMENTS = [ - "cmake", # For building binary targets. - "pip>=23", # For building the pip package. - "pyyaml", # Imported by the kernel codegen tools. - "setuptools>=63", # For building the pip package. - "tomli", # Imported by extract_sources.py when using python < 3.11. - "wheel", # For building the pip package archive. - "zstd", # Imported by resolve_buck.py. ] # Assemble the list of requirements to actually install. # TODO: Add options for reducing the number of requirements. - REQUIREMENTS_TO_INSTALL = ( - EXIR_REQUIREMENTS + DEVEL_REQUIREMENTS + EXAMPLES_REQUIREMENTS - ) + REQUIREMENTS_TO_INSTALL = EXIR_REQUIREMENTS + EXAMPLES_REQUIREMENTS # Install the requirements. `--extra-index-url` tells pip to look for package # versions on the provided URL if they aren't available on the default URL. @@ -141,6 +122,8 @@ def install_requirements(use_pytorch_nightly): "-m", "pip", "install", + "-r", + "requirements-examples.txt", *REQUIREMENTS_TO_INSTALL, "--extra-index-url", TORCH_NIGHTLY_URL, @@ -160,6 +143,8 @@ def install_requirements(use_pytorch_nightly): "-m", "pip", "install", + # Without --no-build-isolation, setup.py can't find the torch module. + "--no-build-isolation", *LOCAL_REQUIREMENTS, ], check=True, diff --git a/requirements-examples.txt b/requirements-examples.txt new file mode 100644 index 0000000000..d4126a178a --- /dev/null +++ b/requirements-examples.txt @@ -0,0 +1,5 @@ +# pip packages needed to run examples. 
+# TODO: Make each example publish its own requirements.txt +timm == 1.0.7 +torchsr == 1.0.4 +transformers ==4.47.1 From 5dfbf478958577b96ccd14cdae235ed35cda27b1 Mon Sep 17 00:00:00 2001 From: JP <46308822+zonglinpeng@users.noreply.github.com> Date: Fri, 17 Jan 2025 10:04:08 -0800 Subject: [PATCH 36/40] fix g3 dequant Differential Revision: D68109702 Pull Request resolved: https://github.com/pytorch/executorch/pull/7683 --- .../cadence/fusion_g3/operators/op_dequantize.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backends/cadence/fusion_g3/operators/op_dequantize.cpp b/backends/cadence/fusion_g3/operators/op_dequantize.cpp index cff50f2a90..3e0235170b 100644 --- a/backends/cadence/fusion_g3/operators/op_dequantize.cpp +++ b/backends/cadence/fusion_g3/operators/op_dequantize.cpp @@ -67,8 +67,8 @@ void check_dequantize_per_tensor_args( ET_CHECK_MSG( input.scalar_type() == dtype, - "input.scalar_type() %" PRId8 " is not matching dtype argumenta:", - static_cast(input.scalar_type())); + "input.scalar_type() %s is not matching dtype arguments:", + ::executorch::runtime::toString(input.scalar_type())); if (out_dtype.has_value()) { ET_CHECK_MSG( @@ -561,11 +561,12 @@ Tensor& dequantize_per_tensor_out( const Tensor& input, double scale, int64_t zero_point, - int64_t quant_min, - int64_t quant_max, + __ET_UNUSED int64_t quant_min, + __ET_UNUSED int64_t quant_max, ScalarType dtype, - ::executorch::aten::optional out_dtype, Tensor& out) { + constexpr ScalarType out_dtype = ScalarType::Float; + #ifdef OP_ARG_CHECK torch::executor::Error err = resize_tensor(out, input.sizes()); ET_CHECK_MSG( From ce77ee7c4363d6c370c5e52da2b85f67f70943d1 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 17 Jan 2025 10:30:36 -0800 Subject: [PATCH 37/40] Fix linux_job_v2 after https://github.com/pytorch/test-infra/pull/6104 (#7731) --- .github/workflows/pull.yml | 3 +++ .github/workflows/trunk.yml | 6 ++++++ 2 files changed, 9 insertions(+) diff --git 
a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 8b32e46cf2..b629a52e72 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -333,6 +333,9 @@ jobs: unittest-arm: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 90bd0eb6ef..0cbbe6f643 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -132,6 +132,9 @@ jobs: test-arm-backend-delegation: name: test-arm-backend-delegation uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk @@ -159,6 +162,9 @@ jobs: test-arm-reference-delegation: name: test-arm-reference-delegation uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk From a8c46d1e43b6d4691efba1b003666af2e8f578f5 Mon Sep 17 00:00:00 2001 From: JP <46308822+zonglinpeng@users.noreply.github.com> Date: Fri, 17 Jan 2025 10:49:46 -0800 Subject: [PATCH 38/40] migrate facto utils to OSS Differential Revision: D68195666 Pull Request resolved: https://github.com/pytorch/executorch/pull/7686 --- examples/cadence/operators/facto_util.py | 91 ++++++++++++++++++++++++ examples/cadence/operators/targets.bzl | 14 ++++ 2 files changed, 105 insertions(+) create mode 100644 examples/cadence/operators/facto_util.py diff --git a/examples/cadence/operators/facto_util.py b/examples/cadence/operators/facto_util.py new file mode 100644 index 0000000000..e9b16f8bf6 --- /dev/null +++ b/examples/cadence/operators/facto_util.py @@ -0,0 +1,91 @@ +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+ +# pyre-strict + +import copy +from typing import List, OrderedDict, Tuple + +import torch +from inputgen.argtuple.gen import ArgumentTupleGenerator +from inputgen.specs.model import ConstraintProducer as cp +from inputgen.utils.random_manager import random_manager +from inputgen.variable.type import ScalarDtype +from specdb.db import SpecDictDB + +# seed to generate identical cases every run to reproduce from bisect +random_manager.seed(1729) + + +def apply_tensor_contraints(op_name: str, tensor_constraints: list[object]) -> None: + match op_name: + case ( + "sigmoid.default" + | "_softmax.default" + | "rsqrt.default" + | "exp.default" + | "mul.Tensor" + | "div.Tensor" + ): + tensor_constraints.append( + cp.Dtype.In(lambda deps: [torch.float]), + ) + case ( + "add.Tensor" + | "sub.Tensor" + | "add.Scalar" + | "sub.Scalar" + | "mul.Scalar" + | "div.Scalar" + ): + tensor_constraints.append( + cp.Dtype.In(lambda deps: [torch.float, torch.int]), + ) + case _: + tensor_constraints.append( + cp.Dtype.In(lambda deps: [torch.float, torch.int]), + ) + tensor_constraints.extend( + [ + cp.Value.Ge(lambda deps, dtype, struct: -(2**8)), + cp.Value.Le(lambda deps, dtype, struct: 2**8), + cp.Rank.Ge(lambda deps: 1), + cp.Rank.Le(lambda deps: 2**2), + cp.Size.Ge(lambda deps, r, d: 1), + cp.Size.Le(lambda deps, r, d: 2**2), + ] + ) + + +def facto_testcase_gen(op_name: str) -> List[Tuple[List[str], OrderedDict[str, str]]]: + # minimal example to test add.Tensor using FACTO + spec = SpecDictDB[op_name] + + for index, in_spec in enumerate(copy.deepcopy(spec.inspec)): + if in_spec.type.is_scalar(): + if in_spec.name != "alpha": + spec.inspec[index].constraints.extend( + [ + cp.Dtype.In(lambda deps: [ScalarDtype.float, ScalarDtype.int]), + cp.Value.Ge(lambda deps, dtype: -(2**8)), + cp.Value.Le(lambda deps, dtype: 2**2), + cp.Size.Ge(lambda deps, r, d: 1), + cp.Size.Le(lambda deps, r, d: 2**2), + ] + ) + else: + spec.inspec[index].constraints.extend( + [ + cp.Value.Gt(lambda deps, 
dtype: 0), + cp.Value.Le(lambda deps, dtype: 2), + ] + ) + elif in_spec.type.is_tensor(): + tensor_constraints = [] + # common tensor constraints + apply_tensor_contraints(op_name, tensor_constraints) + spec.inspec[index].constraints.extend(tensor_constraints) + + return [ + (posargs, inkwargs) + for posargs, inkwargs, _ in ArgumentTupleGenerator(spec).gen() + ] diff --git a/examples/cadence/operators/targets.bzl b/examples/cadence/operators/targets.bzl index e1fbeb9fdf..a646f0076b 100644 --- a/examples/cadence/operators/targets.bzl +++ b/examples/cadence/operators/targets.bzl @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") TESTS_LIST = [ "add_op", @@ -16,6 +17,19 @@ def define_common_targets(): for op in TESTS_LIST: _define_test_target(op) + python_library( + name = "facto_util", + srcs = [ + "facto_util.py", + ], + typing = True, + deps = [ + "fbcode//caffe2:torch", + "fbcode//pytorch/facto:inputgen", + "fbcode//pytorch/facto:specdb", + ], + ) + def _define_test_target(test_name): file_name = "test_{}".format(test_name) From 04f764e2e21084fd271e3439ab2a609a00b6faf5 Mon Sep 17 00:00:00 2001 From: JP <46308822+zonglinpeng@users.noreply.github.com> Date: Fri, 17 Jan 2025 10:56:28 -0800 Subject: [PATCH 39/40] fix typo in cadence cp quantized_conv_out Differential Revision: D68278032 Pull Request resolved: https://github.com/pytorch/executorch/pull/7706 --- backends/cadence/reference/operators/quantized_conv_out.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/cadence/reference/operators/quantized_conv_out.cpp b/backends/cadence/reference/operators/quantized_conv_out.cpp index 5a7af85809..b18159a0b3 100644 --- a/backends/cadence/reference/operators/quantized_conv_out.cpp +++ b/backends/cadence/reference/operators/quantized_conv_out.cpp @@ -119,7 +119,7 @@ 
__attribute__((noinline)) void conv2d_nchw_core_generic( if (((_h + d0 * _wh - p0) >= 0) && ((_h + d0 * _wh - p0) < h) && ((_w + d1 * _ww - p1) >= 0) && - ((_w + d1 * _ww - p1 < w))) { + ((_w + d1 * _ww - p1) < w)) { int ioff = (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); int woff = _wh * ww + _ww; From 8494b9085605229954e869f5aa18e82b20c9ead0 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 17 Jan 2025 13:48:59 -0600 Subject: [PATCH 40/40] [ET-VK][ez] Fix `conv2d_pw` shared memory buffer not having a constant size (#7734) ## Context `conv2d_pw` was failing to compile on Mac due to the shared memory array not having a constant size. Simply hardcode the workgroup size to fix. Differential Revision: [D68331984](https://our.internmc.facebook.com/intern/diff/D68331984/) ghstack-source-id: 261911463 Pull Request resolved: https://github.com/pytorch/executorch/pull/7729 Co-authored-by: Stephen Jia --- backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index a5a2097cd5..f72c487fa7 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -33,7 +33,9 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; // shared memory to hold calculated positions, this would reduce register usage thus improving performance. -shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE]; +// 64 is the number of threads in the local wg +$num_shared = 64 * TILE_SIZE * TILE_SIZE +shared ivec2 pos_shared[${num_shared}]; /* * Computes a 2D pointwise convolution of an NxN output tile. Calculating an