diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index deeaed34ac..f256e8eec6 100644 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -1,5 +1,6 @@ #!/bin/bash # Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -11,10 +12,16 @@ set -o xtrace build_qnn_backend() { echo "Start building qnn backend." export ANDROID_NDK_ROOT=/opt/ndk - export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 + export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)" - bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release + # Workaround for a missing flatccrt library (which can occur depending on the + # number of jobs used); see issue #7300: + # build twice (the second time with `--no_clean`) to make sure libflatccrt.a is + # available. + # TODO: Remove this workaround once the underlying issue is fixed. + bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release || \ + bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release --no_clean } set_up_aot() { diff --git a/.ci/scripts/setup-qnn-deps.sh b/.ci/scripts/setup-qnn-deps.sh index 1280974812..45588e291e 100644 --- a/.ci/scripts/setup-qnn-deps.sh +++ b/.ci/scripts/setup-qnn-deps.sh @@ -16,9 +16,9 @@ install_qnn() { QNN_INSTALLATION_DIR=/tmp/qnn mkdir -p "${QNN_INSTALLATION_DIR}" - curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip" + curl -Lo /tmp/v2.28.0.24.10.29.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.28.0.241029.zip" echo "Finishing downloading qnn sdk." - unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp + unzip -qo /tmp/v2.28.0.24.10.29.zip -d /tmp echo "Finishing unzip qnn sdk." diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index ddc7ad4618..550a09e4c6 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -121,7 +121,7 @@ echo "COREML option ${COREML}" if [[ "${MODE}" =~ .*qnn.* ]]; then QNN=ON export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" - export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 + export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" export PYTHONPATH=".." 
cp schema/program.fbs exir/_serialize/program.fbs diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 9c2a074372..0b4e27e5aa 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -28,6 +28,7 @@ ) from executorch.backends.arm._passes.decompose_linear_pass import DecomposeLinearPass from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass +from executorch.backends.arm._passes.decompose_select import DecomposeSelectPass from executorch.backends.arm._passes.decompose_softmaxes_pass import ( DecomposeSoftmaxesPass, ) @@ -62,7 +63,6 @@ ) from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass from executorch.exir import ExportedProgram -from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_manager import PassManager @@ -72,9 +72,7 @@ class ArmPassManager(PassManager): def _transform(self, graph_module: torch.fx.GraphModule): return self(graph_module).graph_module - def transform_to_backend_pipeline( - self, exported_program: ExportedProgram, compile_spec: list[CompileSpec] - ): + def transform_to_backend_pipeline(self, exported_program: ExportedProgram): """Apply passes before transforming program to backend""" self.add_pass(FuseQuantizedActivationPass()) self.add_pass(DecomposeLinearPass()) @@ -137,11 +135,8 @@ def transform_to_backend_pipeline( self.add_pass(KeepDimsFalseToSqueezePass()) self.add_pass(Conv1dUnsqueezePass(exported_program)) self.add_pass(DecomposeSoftmaxesPass()) - for spec in compile_spec: - if spec.key == "permute_memory_format": - memory_format = spec.value.decode() - if memory_format == "nhwc": - self.add_pass(AnnotateChannelsLastDimOrder()) + self.add_pass(DecomposeSelectPass()) + self.add_pass(AnnotateChannelsLastDimOrder()) return self._transform(exported_program.graph_module) diff --git a/backends/arm/_passes/decompose_select.py b/backends/arm/_passes/decompose_select.py new file mode 100644 index 0000000000..9ea836e633 --- /dev/null +++ b/backends/arm/_passes/decompose_select.py @@ -0,0 +1,56 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +import torch +from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class DecomposeSelectPass(ExportPass): + """ + This pass decomposes select into slice + squeeze to ensure that ATen and TOSA outputs have the same rank (input rank - 1). + """ + + def call(self, graph_module: torch.fx.GraphModule): + for node in graph_module.graph.nodes: + + if node.op != "call_function": + continue + + if node.target in ( + exir_ops.edge.aten.select.int, + exir_ops.edge.aten.select_copy.int, + ): + slice_op = exir_ops.edge.aten.slice_copy.Tensor + squeeze_op = exir_ops.edge.aten.squeeze_copy.dims + else: + continue + + input_node, dim, index = node.args + + rank = len(input_node.meta["val"].size()) + dim = dim % rank if dim < 0 else dim + index = index % input_node.meta["val"].size()[dim] if index < 0 else index # wrap around the dim size, not the rank + dim_list = [dim] # only squeeze the selected dim + + with graph_module.graph.inserting_before(node): + slice_node = create_node( + graph_module.graph, slice_op, (input_node, dim, index, index + 1) + ) + squeeze_node = create_node( + graph_module.graph, squeeze_op, (slice_node, dim_list) + ) + + node.replace_all_uses_with(squeeze_node) + graph_module.graph.erase_node(node) + + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, True) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index e2fdc42b11..4ce95fda43 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024 Arm Limited and/or its affiliates. +# Copyright 2023-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -49,8 +49,6 @@ def __init__(self): self.compiler_flags = [] self.output_format = None self.path_for_intermediates = None - # TODO MLETORCH-265 Remove permute_nhwc flag - self.permute_nhwc = False self.quantize_io = False self.tosa_version = None self.input_order = None @@ -118,16 +116,6 @@ def dump_intermediate_artifacts_to( self.path_for_intermediates = output_path return self - def set_permute_memory_format( - self, set_nhwc_permutation: bool = True - ) -> "ArmCompileSpecBuilder": - """ - Permute to channel last in compiler and runtime. Compilation and - runtime will convert rank 4 inputs to channel last for each sub-graph. - """ - self.permute_nhwc = set_nhwc_permutation - return self - def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": """ Quantization of inputs and dequantization of outputs for cases where @@ -170,11 +158,6 @@ def build(self) -> List[CompileSpec]: CompileSpec("debug_artifact_path", self.path_for_intermediates.encode()) ) - if self.permute_nhwc: - self.compile_spec.append( - CompileSpec("permute_memory_format", "nhwc".encode()) - ) - if self.input_order: self.compile_spec.append( CompileSpec( @@ -188,13 +171,6 @@ def build(self) -> List[CompileSpec]: return self.compile_spec -def is_permute_memory(compile_spec: List[CompileSpec]) -> bool: - for spec in compile_spec: - if spec.key == "permute_memory_format": - return spec.value.decode() == "nhwc" - return False - - def is_tosa(compile_spec: List[CompileSpec]) -> bool: for spec in compile_spec: if spec.key == "output_format": @@ -264,7 +240,7 @@ def preprocess( # noqa: C901 # const data directly.
Path created and data written only in debug builds. tosa_graph = ts.TosaSerializer(artifact_path) graph_module = ArmPassManager().transform_to_backend_pipeline( - exported_program=edge_program, compile_spec=compile_spec + exported_program=edge_program ) node_visitors = get_node_visitors(edge_program, tosa_spec) diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index ee5f2807a9..157e5ec092 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -30,7 +30,6 @@ op_repeat, op_rshift, op_rsqrt, - op_select, op_sigmoid, op_slice, op_squeeze, diff --git a/backends/arm/operators/op_select.py b/backends/arm/operators/op_select.py deleted file mode 100644 index b047a5dd47..0000000000 --- a/backends/arm/operators/op_select.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2024 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -from typing import List - -import serializer.tosa_serializer as ts -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) - -from executorch.backends.arm.tosa_mapping import TosaArg - -from executorch.backends.arm.tosa_utils import build_reshape, tosa_shape -from serializer.tosa_serializer import TosaOp -from torch.fx import Node - - -@register_node_visitor -class SelectVisitor(NodeVisitor): - target = "aten.select_copy.int" - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: Node, - tosa_graph: ts.TosaSerializer, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - - assert len(inputs) == 3 - input_node, dim, index = inputs - shape = input_node.shape - rank = len(shape) - - dim = dim.number % rank if dim.number < 0 else dim.number - index = index.number % rank if index.number < 0 else index.number - - # For aten.select_copy, the output will be rank[input_shape - 1] - # For TOSA rank(in) == rank(out). - # Add an intermediate with the same rank - expanded_shape = tuple(1 if i == dim else shape[i] for i in range(rank)) - expanded_shape = tosa_shape(expanded_shape, input_node.dim_order) - - output_reshaped = tosa_graph.addIntermediate(expanded_shape, output.dtype) - - attr_slice = ts.TosaSerializerAttribute() - - start_attr = [index if i == dim else 0 for i in input_node.dim_order] - size_attr = [ - 1 if i == dim else input_node.shape[i] for i in input_node.dim_order - ] - - attr_slice.SliceAttribute(start_attr, size_attr) - - tosa_graph.addOperator( - TosaOp.Op().SLICE, [input_node.name], [output_reshaped.name], attr_slice - ) - - # Reshape back to original rank of output. - build_reshape(tosa_graph, output_reshaped.name, output.shape, output.name) diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp index 2cc716391b..843e48603b 100644 --- a/backends/arm/runtime/ArmBackendEthosU.cpp +++ b/backends/arm/runtime/ArmBackendEthosU.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2023-2024 Arm Limited and/or its affiliates. + * Copyright 2023-2025 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
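With op_select.py deleted above, aten.select_copy.int no longer has a dedicated TOSA node visitor; it is instead rewritten by the new DecomposeSelectPass added earlier in this diff. A standalone PyTorch sketch of the identity the decomposition relies on (illustration only, not code from this patch; narrow/squeeze stand in for the edge-dialect slice_copy/squeeze_copy ops the pass emits):

```python
# select(x, dim, index) == squeeze(slice(x, dim, index, index + 1), dim):
# the width-1 slice keeps the input rank, so the intermediate satisfies
# TOSA's rank(in) == rank(out) constraint, and the squeeze then restores
# the rank reduction that select implies.
import torch

x = torch.randn(2, 3, 4)
dim, index = -1, -2          # negative arguments, which the pass must normalize

rank = x.dim()
dim = dim % rank             # -1 -> 2
index = index % x.size(dim)  # wraps around the dim size: -2 -> 2

sliced = x.narrow(dim, index, 1)  # shape (2, 3, 1): rank preserved
squeezed = sliced.squeeze(dim)    # shape (2, 3): rank reduced by one

assert torch.equal(squeezed, x.select(dim, index))
```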
@@ -76,7 +76,6 @@ namespace arm { typedef struct { FreeableBuffer* processed; - bool permuted_io_flag; } ExecutionHandle; extern "C" { @@ -125,14 +124,6 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(allocator, ExecutionHandle); handle->processed = processed; - handle->permuted_io_flag = false; - for (auto& compile_spec : compile_specs) { - if (0 == std::strcmp(compile_spec.key, "permute_memory_format") && - 0 == std::memcmp(compile_spec.value.buffer, "nhwc", 4)) { - handle->permuted_io_flag = true; - } - } - // Return the same buffer we were passed - this data will be // executed directly return handle; @@ -225,11 +216,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { // which require permutation. bool permuted_input_shape; ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( - i, - tensor_in, - &handles.inputs->io[i], - execution_handle->permuted_io_flag, - &permuted_input_shape)); + i, tensor_in, &handles.inputs->io[i], &permuted_input_shape)); bool both_char = tensor_in.scalar_type() == ScalarType::Char and handles.inputs->io[i].elem_size == 1; bool both_int = tensor_in.scalar_type() == ScalarType::Int and @@ -330,11 +317,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { bool permuted_output_shape; ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute( - i, - tensor_out, - &handles.outputs->io[i], - execution_handle->permuted_io_flag, - &permuted_output_shape)); + i, tensor_out, &handles.outputs->io[i], &permuted_output_shape)); if (tensor_out.scalar_type() == ScalarType::Char and permuted_output_shape) { EXECUTORCH_PROF_SCOPE( @@ -395,7 +378,6 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { int index, const executorch::aten::Tensor tensor, VelaIO* io, - bool permuted_io_flag, bool* is_permuted) const { bool permuted_shape = false; @@ -409,12 +391,6 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface { if (permuted_shape) { ET_LOG(Debug, "Tensor input/output %d will be permuted", index); } - if (permuted_io_flag != permuted_shape) { - ET_LOG( - Error, - "Permute compile flag and permuted input/output don't agree"); - return Error::InvalidProgram; - } } *is_permuted = permuted_shape; return Error::Ok; diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 8838cb72d6..ba80f1c2d7 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -56,19 +56,15 @@ def maybe_get_tosa_collate_path() -> str | None: return None -def get_tosa_compile_spec( - tosa_version: str, permute_memory_to_nhwc=True, custom_path=None -) -> list[CompileSpec]: +def get_tosa_compile_spec(tosa_version: str, custom_path=None) -> list[CompileSpec]: """ Default compile spec for TOSA tests. """ - return get_tosa_compile_spec_unbuilt( - tosa_version, permute_memory_to_nhwc, custom_path - ).build() + return get_tosa_compile_spec_unbuilt(tosa_version, custom_path).build() def get_tosa_compile_spec_unbuilt( - tosa_version: str, permute_memory_to_nhwc=False, custom_path=None + tosa_version: str, custom_path=None ) -> ArmCompileSpecBuilder: """Get the ArmCompileSpecBuilder for the default TOSA tests, to modify the compile spec before calling .build() to finalize it. 
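With the permute_memory_format compile spec gone, the test helpers in backends/arm/test/common.py (continued below) drop their permute_memory_to_nhwc parameter, and channels-last annotation is now applied unconditionally by ArmPassManager. A minimal caller sketch, assuming an ExecuTorch checkout with the Arm backend test utilities on the path (illustration, not code from this patch; the artifact path is hypothetical):

```python
# The helpers are now invoked without any permute flag; only quantize_io,
# custom_path and reorder_inputs remain as knobs.
from executorch.backends.arm.test import common

tosa_spec = common.get_tosa_compile_spec("TOSA-0.80+BI")  # list[CompileSpec]
u55_spec = common.get_u55_compile_spec(quantize_io=True)
u85_spec = common.get_u85_compile_spec(custom_path="/tmp/arm_artifacts")
```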
@@ -81,7 +77,6 @@ def get_tosa_compile_spec_unbuilt( compile_spec_builder = ( ArmCompileSpecBuilder() .tosa_compile_spec(tosa_version) - .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(custom_path) ) @@ -89,7 +84,6 @@ def get_tosa_compile_spec_unbuilt( def get_u55_compile_spec( - permute_memory_to_nhwc=True, quantize_io=False, custom_path=None, reorder_inputs=None, @@ -98,7 +92,6 @@ def get_u55_compile_spec( Default compile spec for Ethos-U55 tests. """ return get_u55_compile_spec_unbuilt( - permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path, reorder_inputs=reorder_inputs, @@ -106,7 +99,6 @@ def get_u55_compile_spec( def get_u85_compile_spec( - permute_memory_to_nhwc=True, quantize_io=False, custom_path=None, reorder_inputs=None, @@ -115,7 +107,6 @@ def get_u85_compile_spec( Default compile spec for Ethos-U85 tests. """ return get_u85_compile_spec_unbuilt( - permute_memory_to_nhwc, quantize_io=quantize_io, custom_path=custom_path, reorder_inputs=reorder_inputs, @@ -123,7 +114,6 @@ def get_u85_compile_spec( def get_u55_compile_spec_unbuilt( - permute_memory_to_nhwc=True, quantize_io=False, custom_path=None, reorder_inputs=None, @@ -143,7 +133,6 @@ def get_u55_compile_spec_unbuilt( extra_flags="--debug-force-regor --output-format=raw", ) .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) - .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(artifact_path) .set_input_order(reorder_inputs) ) @@ -151,7 +140,6 @@ def get_u55_compile_spec_unbuilt( def get_u85_compile_spec_unbuilt( - permute_memory_to_nhwc=True, quantize_io=False, custom_path=None, reorder_inputs=None, @@ -169,7 +157,6 @@ def get_u85_compile_spec_unbuilt( extra_flags="--output-format=raw", ) .set_quantize_io(is_option_enabled("quantize_io") or quantize_io) - .set_permute_memory_format(permute_memory_to_nhwc) .dump_intermediate_artifacts_to(artifact_path) .set_input_order(reorder_inputs) ) diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index b5ff882537..b2fc271ade 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -111,7 +111,6 @@ def test_numerical_diff_prints(self): example_inputs=model.get_inputs(), compile_spec=common.get_tosa_compile_spec( "TOSA-0.80+MI", - permute_memory_to_nhwc=True, custom_path=tempfile.mkdtemp("diff_print_test"), ), ) diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index fca743a6fa..d29695dedf 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -56,9 +56,7 @@ def test_mv2_tosa_MI(self): ArmTester( self.mv2, example_inputs=self.model_inputs, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() .to_edge_transform_and_lower(edge_compile_config=self._edge_compile_config) @@ -71,9 +69,7 @@ def test_mv2_tosa_BI(self): ArmTester( self.mv2, example_inputs=self.model_inputs, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), ) .quantize() .export() @@ -92,7 +88,7 @@ def test_mv2_u55_BI(self): ArmTester( self.mv2, example_inputs=self.model_inputs, - compile_spec=common.get_u55_compile_spec(permute_memory_to_nhwc=True), + compile_spec=common.get_u55_compile_spec(), ) .quantize() .export() @@ -112,7 +108,7 @@ def test_mv2_u85_BI(self): ArmTester( self.mv2, example_inputs=self.model_inputs, - compile_spec=common.get_u85_compile_spec(permute_memory_to_nhwc=True), + compile_spec=common.get_u85_compile_spec(), ) .quantize() .export() diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 24faace007..0aa3c6cba9 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -137,7 +137,7 @@ def test_add_u55_BI(self, test_data: torch.Tensor): test_data = (test_data,) self._test_add_ethos_BI_pipeline( self.Add(), - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), test_data, ) @@ -147,7 +147,7 @@ def test_add_u85_BI(self, test_data: torch.Tensor): test_data = (test_data,) self._test_add_ethos_BI_pipeline( self.Add(), - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), test_data, ) diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index 27629701c3..bc37fbb136 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -57,9 +57,7 @@ def _test_avgpool2d_tosa_MI_pipeline( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() .check(["torch.ops.aten.avg_pool2d.default"]) @@ -81,7 +79,7 @@ def _test_avgpool2d_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) @@ -155,7 +153,7 @@ def test_avgpool2d_tosa_u55_BI( ): self._test_avgpool2d_tosa_ethos_BI_pipeline( self.AvgPool2d(*model_params), - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), (test_data,), ) @@ -169,6 +167,6 @@ def test_avgpool2d_tosa_u85_BI( ): self._test_avgpool2d_tosa_ethos_BI_pipeline( self.AvgPool2d(*model_params), - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), (test_data,), ) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 0b830fa46b..06470d91e8 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -159,7 +159,7 @@ def test_bmm_u55_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor) self.BMM(), common.get_u55_compile_spec(), test_data ) - @parameterized.expand(BMM.test_parameters[:1]) + @parameterized.expand(BMM.test_parameters) @pytest.mark.corstone_fvp def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) @@ -167,15 +167,6 @@ def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): self.BMM(), common.get_u85_compile_spec(), test_data ) - @parameterized.expand(BMM.test_parameters[1:]) - @pytest.mark.corstone_fvp - @conftest.expectedFailureOnFVP - def test_bmm_u85_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor): - test_data = (operand1, operand2) - self._test_bmm_ethosu_BI_pipeline( - self.BMM(), common.get_u85_compile_spec(), test_data - ) - # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(BMMSingleInput.test_parameters) @pytest.mark.corstone_fvp diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index 593260ac56..b754a91f36 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -228,7 +228,7 @@ def _test_conv1d_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -250,7 +250,7 @@ def _test_conv1d_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -291,18 +291,13 @@ def test_conv1d_tosa_MI(self, test_name, model): def test_conv1d_tosa_BI(self, test_name, model): self._test_conv1d_tosa_BI_pipeline(model, model.get_inputs()) - # Expeted to fail as Conv1D requires transpoes which isn't supported on u55 @parameterized.expand(testsuite) @pytest.mark.corstone_fvp - @unittest.expectedFailure def test_conv1d_u55_BI(self, test_name, model): self._test_conv1d_ethosu_BI_pipeline( model, common.get_u55_compile_spec(), model.get_inputs() ) - # This specific test case has numerical errors on FVP, MLETORCH-520. - testsuite.remove(("5_3x2x128_st1", conv1d_5_3x2x128_st1)) - @parameterized.expand(testsuite) @pytest.mark.corstone_fvp def test_conv1d_u85_BI(self, test_name, model): diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 9ccac53940..bbcb421ce7 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -255,7 +255,7 @@ def _test_conv2d_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -277,7 +277,7 @@ def _test_conv2d_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -330,7 +330,7 @@ def test_conv2d_tosa_BI(self, test_name, model): @pytest.mark.corstone_fvp def test_conv2d_u55_BI(self, test_name, model): self._test_conv2d_ethosu_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), model, model.get_inputs(), ) @@ -339,7 +339,7 @@ def test_conv2d_u55_BI(self, test_name, model): @pytest.mark.corstone_fvp def test_conv2d_u85_BI(self, test_name, model): self._test_conv2d_ethosu_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), model, model.get_inputs(), ) diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 4a5615f97c..8352727a1c 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -194,7 +194,7 @@ def _test_conv_combo_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -218,7 +218,7 @@ def _test_conv_combo_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -273,7 +273,7 @@ def test_conv_meandim_u55_BI(self): model = ComboConv2dMeandim() self._test_conv_combo_ethos_BI_pipeline( model, - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), model.get_inputs(), ) @@ -282,7 +282,7 @@ def test_conv_meandim_u85_BI(self): model = ComboConv2dMeandim() self._test_conv_combo_ethos_BI_pipeline( model, - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), model.get_inputs(), ) @@ -334,7 +334,7 @@ def test_conv_relu6_u55_BI(self, test_data: torch.Tensor): model = ComboConvRelu6() test_data = (test_data,) self._test_conv_combo_ethos_BI_pipeline( - model, common.get_u55_compile_spec(permute_memory_to_nhwc=True), test_data + model, common.get_u55_compile_spec(), test_data ) @parameterized.expand(ComboConvRelu6.test_data) @@ -343,7 +343,7 @@ def test_conv_relu6_u85_BI(self, test_data: torch.Tensor): model = ComboConvRelu6() test_data = (test_data,) self._test_conv_combo_ethos_BI_pipeline( - model, common.get_u85_compile_spec(permute_memory_to_nhwc=True), test_data + model, common.get_u85_compile_spec(), test_data ) ############################### @@ -364,7 +364,7 @@ def test_block_bottleneck_residual_u55_BI(self): model = ComboBlockBottleneckResidual() self._test_conv_combo_ethos_BI_pipeline( model, - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), model.get_inputs(), ) @@ -373,7 +373,7 @@ def test_block_bottleneck_residual_u85_BI(self): model = ComboBlockBottleneckResidual() self._test_conv_combo_ethos_BI_pipeline( model, - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), model.get_inputs(), ) diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 3ce7584086..e183dcc9c6 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -152,9 +152,9 @@ testsuite_conv2d = [ ("2x2_1x6x4x4_gp6_st1", dw_conv2d_2x2_1x6x4x4_gp6_st1), ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1), + ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias), ("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1), ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3), - ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias), ("two_dw_conv2d", two_dw_conv2d), ] @@ -191,7 +191,7 @@ def _test_dw_conv_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -211,7 +211,7 @@ def _test_dw_conv_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -257,37 +257,37 @@ def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_tosa_BI_pipeline(model, model.get_inputs()) - testsuite_conv2d.remove( - ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1) - ) # Works - - @parameterized.expand(testsuite_conv2d, skip_on_empty=True) + @parameterized.expand(testsuite_conv2d[:4], skip_on_empty=True) @pytest.mark.corstone_fvp - @unittest.expectedFailure def test_dw_conv2d_u55_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u55_compile_spec( - permute_memory_to_nhwc=True, quantize_io=set_quantize_io - ), + common.get_u55_compile_spec(quantize_io=set_quantize_io), + model.get_inputs(), + ) + + @parameterized.expand(testsuite_conv2d[4:], skip_on_empty=True) + @pytest.mark.corstone_fvp + @unittest.expectedFailure # TODO: MLETORCH-516 + def test_dw_conv2d_u55_BI_xfails( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv_ethos_BI_pipeline( + model, + common.get_u55_compile_spec(quantize_io=set_quantize_io), model.get_inputs(), ) - # Expected to fail as conv1d needs transpose which is not supported - # on u55. @parameterized.expand(testsuite_conv1d, skip_on_empty=True) @pytest.mark.corstone_fvp - @unittest.expectedFailure def test_dw_conv1d_u55_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u55_compile_spec( - permute_memory_to_nhwc=True, quantize_io=set_quantize_io - ), + common.get_u55_compile_spec(quantize_io=set_quantize_io), model.get_inputs(), ) @@ -298,9 +298,7 @@ def test_dw_conv_u85_BI( ): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u85_compile_spec( - permute_memory_to_nhwc=True, quantize_io=set_quantize_io - ), + common.get_u85_compile_spec(quantize_io=set_quantize_io), model.get_inputs(), ) @@ -313,8 +311,6 @@ def test_dw_conv_u85_BI_xfails( ): self._test_dw_conv_ethos_BI_pipeline( model, - common.get_u85_compile_spec( - permute_memory_to_nhwc=True, quantize_io=set_quantize_io - ), + common.get_u85_compile_spec(quantize_io=set_quantize_io), model.get_inputs(), ) diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index d5f6174469..062dbfacae 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. 
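The test_depthwise_conv.py hunks above use a pattern repeated throughout this diff: the parameter list is sliced so that passing cases run normally while known failures stay tracked under @unittest.expectedFailure instead of being deleted. A self-contained illustration of that pattern (hypothetical names, not code from this patch):

```python
import unittest

from parameterized import parameterized

CASES = [("small", 1), ("medium", 2), ("large", 3)]


class DemoTest(unittest.TestCase):
    @parameterized.expand(CASES[:2])  # expected to pass
    def test_supported(self, name, value):
        self.assertGreater(value, 0)

    # skip_on_empty mirrors the diff's usage and tolerates an empty slice.
    @parameterized.expand(CASES[2:], skip_on_empty=True)
    @unittest.expectedFailure
    def test_known_failure(self, name, value):
        self.assertLess(value, 0)


if __name__ == "__main__":
    unittest.main()
```

Keeping the failing slice parameterized means the tracked issue (here, TODO: MLETORCH-516) stays executable and will surface as an unexpected success once the underlying problem is fixed.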
-# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -185,7 +185,7 @@ def test_div_tosa_BI( test_data = (input_, other_) self._test_div_tosa_BI_pipeline(self.Div(), test_data) - @parameterized.expand(test_data_suite[:2]) + @parameterized.expand(test_data_suite[:3]) @pytest.mark.corstone_fvp def test_div_u55_BI( self, @@ -200,7 +200,7 @@ def test_div_u55_BI( ) # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[2:]) + @parameterized.expand(test_data_suite[3:]) @pytest.mark.corstone_fvp @conftest.expectedFailureOnFVP def test_div_u55_BI_xfails( @@ -215,7 +215,7 @@ def test_div_u55_BI_xfails( self.Div(), common.get_u55_compile_spec(), test_data ) - @parameterized.expand(test_data_suite[:2]) + @parameterized.expand(test_data_suite[:3]) @pytest.mark.corstone_fvp def test_div_u85_BI( self, @@ -230,7 +230,7 @@ def test_div_u85_BI( ) # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[2:]) + @parameterized.expand(test_data_suite[3:]) @pytest.mark.corstone_fvp @conftest.expectedFailureOnFVP def test_div_u85_BI_xfails( diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 1b6d6e6ae3..fc82fa4dd7 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -76,15 +76,12 @@ def _test_full_tosa_BI_pipeline( self, module: torch.nn.Module, test_data: Tuple, - permute_memory_to_nhwc: bool, ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=permute_memory_to_nhwc - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), ) .quantize() .export() @@ -134,7 +131,7 @@ def test_const_full_tosa_MI(self): def test_const_full_nhwc_tosa_BI(self): _input = torch.rand((2, 2, 3, 3)) * 10 - self._test_full_tosa_BI_pipeline(self.AddConstFull(), (_input,), True) + self._test_full_tosa_BI_pipeline(self.AddConstFull(), (_input,)) @parameterized.expand(AddVariableFull.test_parameters) def test_full_tosa_MI(self, test_tensor: Tuple): @@ -144,7 +141,7 @@ def test_full_tosa_MI(self, test_tensor: Tuple): @parameterized.expand(AddVariableFull.test_parameters) def test_full_tosa_BI(self, test_tensor: Tuple): - self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor, False) + self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor) # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(AddVariableFull.test_parameters) diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index 2d88421fb5..0570afc03e 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -78,7 +78,7 @@ def _test_layernorm_tosa_MI_pipeline( model=module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -99,7 +99,7 @@ def _test_layernorm_tosa_BI_pipeline( model=module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() @@ -158,9 +158,21 @@ def test_layer_norm_tosa_BI( self.LayerNorm(*model_params), (test_data,) ) + @parameterized.expand(test_data_suite[4:]) + @pytest.mark.corstone_fvp + def test_layer_norm_u55_BI( + self, + test_name: str, + test_data: torch.Tensor, + model_params, + ): + self._test_layernorm_ethosu_BI_pipeline( + self.LayerNorm(*model_params), common.get_u55_compile_spec(), (test_data,) + ) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 # Skip tests that require transposes. - @parameterized.expand(test_data_suite) + @parameterized.expand(test_data_suite[:4]) @pytest.mark.corstone_fvp @unittest.expectedFailure def test_layer_norm_u55_BI_xfails( diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index cd14b7801d..825b2f9bc9 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -137,7 +137,7 @@ def _test_linear_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -157,7 +157,7 @@ def _test_linear_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index 69c8ee06ec..d1581423a0 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -31,10 +31,6 @@ ("ones", torch.ones(10, 10), 1), ("ones_neg_dim", torch.ones(10, 3, 4), -1), ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), -] - -test_data_suite_u55_xfails = [ - # (test_name, test_data, dim) ("zeros", torch.zeros(10, 8, 5, 2), 0), ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), ("rand", torch.rand(1, 2, 5, 8), 2), @@ -161,19 +157,6 @@ def test_logsoftmax_tosa_u55_BI( self.LogSoftmax(dim=dim), (test_data,) ) - # Expected to fail as this is not supported on u55. 
- @parameterized.expand(test_data_suite_u55_xfails) - @unittest.expectedFailure - def test_logsoftmax_tosa_u55_BI_xfails( - self, - test_name: str, - test_data: torch.Tensor, - dim: int, - ): - self._test_logsoftmax_tosa_u55_BI_pipeline( - self.LogSoftmax(dim=dim), (test_data,) - ) - @parameterized.expand(test_data_suite) def test_logsoftmax_tosa_u85_BI( self, diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index a693c7d549..81f27beab4 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ b/backends/arm/test/ops/test_max_pool.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -65,7 +65,7 @@ def _test_maxpool2d_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -92,7 +92,7 @@ def _test_maxpool2d_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) @@ -171,7 +171,7 @@ def test_maxpool2d_tosa_u55_BI( ): tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( self.MaxPool2d(*model_params), - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), (test_data,), ) if conftest.is_option_enabled("corstone_fvp"): @@ -189,7 +189,7 @@ def test_maxpool2d_tosa_u85_BI( ): tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( self.MaxPool2d(*model_params), - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), (test_data,), ) if conftest.is_option_enabled("corstone_fvp"): @@ -230,7 +230,7 @@ def test_maxpool2d_tosa_u55_BI_mult_batches( ): tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( self.MaxPool2d(*model_params), - common.get_u55_compile_spec(permute_memory_to_nhwc=True), + common.get_u55_compile_spec(), (test_data,), ) if conftest.is_option_enabled("corstone_fvp"): @@ -249,7 +249,7 @@ def test_maxpool2d_tosa_u85_BI_mult_batches( ): tester = self._test_maxpool2d_tosa_ethos_BI_pipeline( self.MaxPool2d(*model_params), - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), (test_data,), ) if conftest.is_option_enabled("corstone_fvp"): diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e4f6afcbd6..393cf1667e 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -269,10 +269,8 @@ def test_meandim_tosa_BI( ): self._test_meandim_tosa_BI_pipeline(self.MeanDim(dim, keepdim), (test_data,)) - # Expected to fail as this is not supported on u55. 
@parameterized.expand(MeanDim.test_data_suite) - @unittest.expectedFailure - def test_meandim_tosa_u55_BI_xfails( + def test_meandim_tosa_u55_BI( self, test_name: str, test_data: torch.Tensor, diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index 9d789a8e33..715673b87c 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -73,7 +73,7 @@ def _test_mul_tosa_MI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=True + "TOSA-0.80+MI", ), ) .export() @@ -94,7 +94,7 @@ def _test_mul_tosa_BI_pipeline( module, example_inputs=test_data, compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=True + "TOSA-0.80+BI", ), ) .quantize() diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index b373af1401..ec7ecaa81b 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -53,15 +53,12 @@ def _test_permute_tosa_MI_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.tensor], - permute_memory_to_nhwc: bool, ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=permute_memory_to_nhwc - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() .check(["torch.ops.aten.permute.default"]) @@ -127,10 +124,8 @@ def _test_permute_ethos_BI_pipeline( def test_permute_tosa_MI( self, test_name: str, test_data: torch.Tensor, dims: list[int] ): - self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,), True) - self._test_permute_tosa_MI_pipeline( - self.Permute(dims=dims), (test_data,), False - ) + self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,)) + self._test_permute_tosa_MI_pipeline(self.Permute(dims=dims), (test_data,)) @parameterized.expand(test_data_suite) def test_permute_tosa_BI( @@ -141,7 +136,6 @@ def test_permute_tosa_BI( # Expected to fail as TOSA.Transpose is not supported by Ethos-U55. @parameterized.expand(test_data_suite[0:1]) @pytest.mark.corstone_fvp - @unittest.expectedFailure def test_permute_u55_BI( self, test_name: str, test_data: torch.Tensor, dims: list[int] ): diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index f43f7af13c..bad872792b 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -107,20 +107,12 @@ def test_repeat_tosa_MI(self, test_input, multiples): def test_repeat_tosa_BI(self, test_input, multiples): self._test_repeat_tosa_BI_pipeline(self.Repeat(), (test_input, multiples)) - @parameterized.expand(Repeat.test_parameters[:-1]) + @parameterized.expand(Repeat.test_parameters) def test_repeat_u55_BI(self, test_input, multiples): self._test_repeat_ethosu_pipeline( common.get_u55_compile_spec(), self.Repeat(), (test_input, multiples) ) - # Final test requires transpose which is not supported on u55. - @parameterized.expand(Repeat.test_parameters[-1:]) - @unittest.expectedFailure - def test_repeat_u55_BI_xfails(self, test_input, multiples): - self._test_repeat_ethosu_pipeline( - common.get_u55_compile_spec(), self.Repeat(), (test_input, multiples) - ) - @parameterized.expand(Repeat.test_parameters) def test_repeat_u85_BI(self, test_input, multiples): self._test_repeat_ethosu_pipeline( diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py index c39b20a731..b474da573f 100644 --- a/backends/arm/test/ops/test_select.py +++ b/backends/arm/test/ops/test_select.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -51,21 +51,19 @@ def _test_select_tosa_MI_pipeline( test_data: test_data_t, export_target: str, ): - # For 4D tensors, do not permute to NHWC - permute = False if len(test_data[0].shape) == 4 else True ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+MI", permute_memory_to_nhwc=permute - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"), ) .export() .check([export_target]) .check_not(["torch.ops.quantized_decomposed"]) .to_edge() + .dump_artifact() .partition() + .dump_artifact() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() .run_method_and_compare_outputs(inputs=test_data) @@ -77,15 +75,11 @@ def _test_select_tosa_BI_pipeline( test_data: test_data_t, export_target: str, ): - # For 4D tensors, do not permute to NHWC - permute = False if len(test_data[0].shape) == 4 else True ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=permute - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), ) .quantize() .export() @@ -124,10 +118,8 @@ def _test_select_ethos_BI_pipeline( def _test_select_tosa_u55_BI_pipeline( self, module: torch.nn.Module, test_data: test_data_t, export_target: str ): - # For 4D tensors, do not permute to NHWC - permute = False if len(test_data[0].shape) == 4 else True self._test_select_ethos_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=permute), + common.get_u55_compile_spec(), module, test_data, export_target, @@ -136,10 +128,8 @@ def _test_select_tosa_u55_BI_pipeline( def _test_select_tosa_u85_BI_pipeline( self, module: torch.nn.Module, test_data: test_data_t, export_target: str ): - # For 4D tensors, do not permute to NHWC - permute = False if len(test_data[0].shape) == 4 else True self._test_select_ethos_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=permute), + common.get_u85_compile_spec(), module, test_data, export_target, diff --git a/backends/arm/test/ops/test_slice.py 
b/backends/arm/test/ops/test_slice.py index 511873a8c2..7cb82e3a82 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -52,16 +52,14 @@ def _test_slice_tosa_MI_pipeline( ) def _test_slice_tosa_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor], permute: bool + self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] ): ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_tosa_compile_spec( - "TOSA-0.80+BI", permute_memory_to_nhwc=permute - ), + compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"), ) .quantize() .export() @@ -114,11 +112,11 @@ def test_slice_tosa_MI(self, tensor): @parameterized.expand(Slice.test_tensors[:2]) def test_slice_nchw_tosa_BI(self, test_tensor: torch.Tensor): - self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), False) + self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,)) @parameterized.expand(Slice.test_tensors[2:]) def test_slice_nhwc_tosa_BI(self, test_tensor: torch.Tensor): - self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,), True) + self._test_slice_tosa_BI_pipeline(self.Slice(), (test_tensor,)) @parameterized.expand(Slice.test_tensors) def test_slice_u55_BI(self, test_tensor: torch.Tensor): diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index fd78d1a9ac..794f6b791f 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -1,5 +1,5 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -33,10 +33,6 @@ ("ones", torch.ones(10, 10), 1), ("ones_neg_dim", torch.ones(10, 3, 4), -1), ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), -] - -test_data_suite_u55_xfails = [ - # (test_name, test_data, dim) ("zeros", torch.zeros(10, 8, 5, 2), 0), ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), ("rand", torch.rand(1, 2, 5, 8), 2), @@ -161,17 +157,6 @@ def test_softmax_tosa_u55_BI( ): self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) - # Expected to fail as this is not supported on u55. - @parameterized.expand(test_data_suite_u55_xfails) - @unittest.expectedFailure - def test_softmax_tosa_u55_BI_xfails( - self, - test_name: str, - test_data: torch.Tensor, - dim: int, - ): - self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) - @parameterized.expand(test_data_suite) def test_softmax_tosa_u85_BI( self, diff --git a/backends/arm/test/ops/test_squeeze.py b/backends/arm/test/ops/test_squeeze.py index ac26fd73fa..9f02392e1e 100644 --- a/backends/arm/test/ops/test_squeeze.py +++ b/backends/arm/test/ops/test_squeeze.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. 
# # This source code is licensed under the BSD-style license found in the @@ -136,7 +136,7 @@ def test_squeeze_u55_BI( test_tensor: torch.Tensor, ): self._test_squeeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=False), + common.get_u55_compile_spec(), self.Squeeze(), (test_tensor,), "torch.ops.aten.squeeze.default", @@ -148,7 +148,7 @@ def test_squeeze_u85_BI( test_tensor: torch.Tensor, ): self._test_squeeze_ethosu_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), self.Squeeze(), (test_tensor,), "torch.ops.aten.squeeze.default", @@ -169,7 +169,7 @@ def test_squeeze_dim_tosa_BI(self, test_tensor: torch.Tensor, dim: int): @parameterized.expand(SqueezeDim.test_parameters) def test_squeeze_dim_u55_BI(self, test_tensor: torch.Tensor, dim: int): self._test_squeeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=False), + common.get_u55_compile_spec(), self.SqueezeDim(), (test_tensor, dim), "torch.ops.aten.squeeze.dim", @@ -178,7 +178,7 @@ def test_squeeze_dim_u55_BI(self, test_tensor: torch.Tensor, dim: int): @parameterized.expand(SqueezeDim.test_parameters) def test_squeeze_dim_u85_BI(self, test_tensor: torch.Tensor, dim: int): self._test_squeeze_ethosu_BI_pipeline( - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), self.SqueezeDim(), (test_tensor, dim), "torch.ops.aten.squeeze.dim", @@ -199,7 +199,7 @@ def test_squeeze_dims_tosa_BI(self, test_tensor: torch.Tensor, dims: tuple[int]) @parameterized.expand(SqueezeDims.test_parameters) def test_squeeze_dims_u55_BI(self, test_tensor: torch.Tensor, dims: tuple[int]): self._test_squeeze_ethosu_BI_pipeline( - common.get_u55_compile_spec(permute_memory_to_nhwc=False), + common.get_u55_compile_spec(), self.SqueezeDims(), (test_tensor, dims), "torch.ops.aten.squeeze.dims", diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 098e0fd1bc..7f85cba4c3 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -39,9 +39,6 @@ class Sum(torch.nn.Module): ((torch.rand(10), 0, True),), ((torch.rand(10, 10), 1, False),), ((torch.rand(1, 2, 3, 4), 3, True),), - ] - - test_parameters_u55_xfails: list[Tuple[exampledata_t]] = [ ((torch.rand(10, 10, 10), [-3, 1], True),), ((torch.rand(2, 1, 5, 8), 1, False),), ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), @@ -129,17 +126,7 @@ def test_sum_u55_BI(self, test_data: tuple[exampledata_t]): self._test_sum_ethosu_BI_pipeline( self.Sum(), test_data, - common.get_u55_compile_spec(permute_memory_to_nhwc=False), - ) - - # Expected to fail as this is not supported on u55. 
- @parameterized.expand(Sum.test_parameters_u55_xfails) - @unittest.expectedFailure - def test_sum_u55_BI_xfails(self, test_data: tuple[exampledata_t]): - self._test_sum_ethosu_BI_pipeline( - self.Sum(), - test_data, - common.get_u55_compile_spec(permute_memory_to_nhwc=False), + common.get_u55_compile_spec(), ) @parameterized.expand(Sum.test_parameters) @@ -147,5 +134,5 @@ def test_sum_u85_BI(self, test_data: tuple[exampledata_t]): self._test_sum_ethosu_BI_pipeline( self.Sum(), test_data, - common.get_u85_compile_spec(permute_memory_to_nhwc=True), + common.get_u85_compile_spec(), ) diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py index a6faf70af0..68f4fe4612 100644 --- a/backends/arm/test/ops/test_unsqueeze.py +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -95,7 +95,7 @@ def test_unsqueeze_tosa_MI(self, test_tensor: torch.Tensor): def test_unsqueeze_tosa_BI(self, test_tensor: torch.Tensor): self._test_unsqueeze_tosa_BI_pipeline(self.Unsqueeze(), (test_tensor, 0)) - @parameterized.expand(Unsqueeze.test_parameters[:-1]) + @parameterized.expand(Unsqueeze.test_parameters) def test_unsqueeze_u55_BI(self, test_tensor: torch.Tensor): self._test_unsqueeze_ethosu_BI_pipeline( common.get_u55_compile_spec(), diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index 322ac5b0ed..e1fed05817 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -158,10 +158,8 @@ def test_var_tosa_MI(self, test_tensor: torch.Tensor, keepdim, correction): def test_var_tosa_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_tosa_BI_pipeline(self.Var(), (test_tensor, keepdim, correction)) - # Expected to fail as this is not supported on u55. @parameterized.expand(Var.test_parameters) - @unittest.expectedFailure - def test_var_u55_BI_xfails(self, test_tensor: torch.Tensor, keepdim, correction): + def test_var_u55_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_ethosu_BI_pipeline( self.Var(), common.get_u55_compile_spec(), @@ -196,18 +194,6 @@ def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, correctio (test_tensor, dim, keepdim, correction), ) - # Expected to fail as this is not supported on u55. - @parameterized.expand(VarDim.test_parameters_u55_xfails) - @unittest.expectedFailure - def test_var_dim_u55_BI_xfails( - self, test_tensor: torch.Tensor, dim, keepdim, correction - ): - self._test_var_ethosu_BI_pipeline( - self.VarDim(), - common.get_u55_compile_spec(), - (test_tensor, dim, keepdim, correction), - ) - @parameterized.expand(VarDim.test_parameters) def test_var_dim_u85_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): self._test_var_ethosu_BI_pipeline( @@ -232,10 +218,8 @@ def test_var_correction_tosa_BI( self.VarCorrection(), (test_tensor, dim, keepdim, correction) ) - # Expected to fail as this is not supported on u55. 
@parameterized.expand(VarCorrection.test_parameters) - @unittest.expectedFailure - def test_var_correction_u55_BI_xfails( + def test_var_correction_u55_BI( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index 1603a2a37d..f90ae40206 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -129,15 +129,10 @@ def test_view_tosa_MI(self, test_tensor: torch.Tensor, new_shape): def test_view_tosa_BI(self, test_tensor: torch.Tensor, new_shape): self._test_view_tosa_BI_pipeline(self.View(), (test_tensor, new_shape)) - @parameterized.expand(View.no_transpose_tests) + @parameterized.expand(View.needs_transpose_tests + View.no_transpose_tests) def test_view_u55_BI(self, test_tensor: torch.Tensor, new_shape): self._test_view_u55_BI_pipeline(self.View(), (test_tensor, new_shape)) - @parameterized.expand(View.needs_transpose_tests) - @unittest.expectedFailure - def test_view_transpose_u55_BI(self, test_tensor: torch.Tensor, new_shape): - self._test_view_u55_BI_pipeline(self.View(), (test_tensor, new_shape)) - @parameterized.expand(View.needs_transpose_tests + View.no_transpose_tests) def test_view_u85_BI(self, test_tensor: torch.Tensor, new_shape): self._test_view_u85_BI_pipeline(self.View(), (test_tensor, new_shape)) diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index b3f5b4f05b..2b65c306be 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -15,7 +15,7 @@ import torch.fx -from executorch.backends.arm.arm_backend import get_intermediate_path, is_permute_memory +from executorch.backends.arm.arm_backend import get_intermediate_path from executorch.backends.arm.arm_partitioner import ArmPartitioner from executorch.backends.arm.quantizer.arm_quantizer import ( ArmQuantizer, @@ -329,7 +329,6 @@ def run_method_and_compare_outputs( logger.info( f"Comparing Stage '{self.stage_name(test_stage)}' with Stage '{self.stage_name(reference_stage)}'" ) - is_nhwc = is_permute_memory(self.compile_spec) # Loop inputs and compare reference stage with the compared stage. 
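With the compile-spec flag gone, the tester always shuffles data between NCHW and NHWC around the ToExecutorch stage instead of consulting is_permute_memory. Roughly what transpose_data_format does for rank-4 tensors (a simplified sketch, not the actual helper, which operates on lists of tensors):

    import torch

    def transpose_data_format(t: torch.Tensor, order: str) -> torch.Tensor:
        # Only rank-4 tensors have a channel axis to relocate.
        if t.dim() != 4:
            return t
        perm = (0, 2, 3, 1) if order == "NHWC" else (0, 3, 1, 2)  # else "NCHW"
        return t.permute(*perm).contiguous()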
for run_iteration in range(num_runs): @@ -344,10 +343,7 @@ def run_method_and_compare_outputs( if isinstance(arg, tuple) and isinstance(arg[0], torch.Tensor): test_input.extend([tensor.clone() for tensor in arg]) - if ( - is_nhwc - and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] - ): + if test_stage == self.stages[self.stage_name(tester.ToExecutorch)]: test_input = self.transpose_data_format(test_input, "NHWC") input_shapes = [ @@ -359,10 +355,7 @@ def run_method_and_compare_outputs( reference_output = reference_stage.run_artifact(reference_input) test_output = test_stage.run_artifact(test_input) - if ( - is_nhwc - and test_stage == self.stages[self.stage_name(tester.ToExecutorch)] - ): + if test_stage == self.stages[self.stage_name(tester.ToExecutorch)]: test_output = self.transpose_data_format(test_output, "NCHW") self._compare_outputs( diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 6c71909c47..e2ac3de5ca 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -60,9 +60,6 @@ if(EXECUTORCH_CADENCE_CPU_RUNNER) ${_common_include_directories} ) - set(TARGET_DIR reference) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) - target_link_libraries( cadence_runner executorch @@ -78,12 +75,12 @@ endif() if(EXECUTORCH_NNLIB_OPT) set(TARGET_DIR hifi) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) -endif() - -if(EXECUTORCH_FUSION_G3_OPT) +elseif(EXECUTORCH_FUSION_G3_OPT) set(TARGET_DIR fusion_g3) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib) +else() + set(TARGET_DIR reference) endif() +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators) diff --git a/backends/cadence/fusion_g3/operators/op_mean.cpp b/backends/cadence/fusion_g3/operators/op_mean.cpp index be866b2f51..289baceb12 100644 --- a/backends/cadence/fusion_g3/operators/op_mean.cpp +++ b/backends/cadence/fusion_g3/operators/op_mean.cpp @@ -59,7 +59,7 @@ int prepare_data( return num_axis_dims; } -Tensor& mean_dim_out( +Tensor& mean_out( KernelRuntimeContext& ctx, const Tensor& in, optional<ArrayRef<int64_t>> dim_list, @@ -199,4 +199,4 @@ Tensor& mean_dim_out( } // namespace native } // namespace G3 } // namespace impl -} // namespace cadence \ No newline at end of file +} // namespace cadence diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 3d5a432431..699e064669 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -51,6 +51,11 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary( } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { num_graphs = binaryinfo->contextBinaryInfoV2.numGraphs; graphs = binaryinfo->contextBinaryInfoV2.graphs; +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) { + num_graphs = binaryinfo->contextBinaryInfoV3.numGraphs; + graphs = binaryinfo->contextBinaryInfoV3.graphs; +#endif } else { QNN_EXECUTORCH_LOG_WARN( "Unknown QNN BinaryInfo version %d.", binaryinfo->version); @@ -62,6 +67,10 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary( RetrieveGraphInfo(graphs[i].graphInfoV1); } else if (graphs->version ==
QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_2) { RetrieveGraphInfo(graphs[i].graphInfoV2); +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + } else if (graphs->version == QNN_SYSTEM_CONTEXT_GRAPH_INFO_VERSION_3) { + RetrieveGraphInfo(graphs[i].graphInfoV3); +#endif } else { QNN_EXECUTORCH_LOG_WARN( "Unknown QNN GraphInfo version %d.", binaryinfo->version); diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp index 757034baa8..030b5666da 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp @@ -17,6 +17,9 @@ using executorch::runtime::Error; Error HtpBackendCache::RetrieveBackendBinaryInfo( const QnnSystemContext_BinaryInfo_t* binaryinfo) { QnnHtpSystemContext_HwBlobInfo_t* htp_hwblobinfo = nullptr; +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + QnnHtpSystemContext_GraphBlobInfo_t* htp_graphblobinfo = nullptr; +#endif if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) { htp_hwblobinfo = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>( @@ -24,27 +27,43 @@ Error HtpBackendCache::RetrieveBackendBinaryInfo( } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2) { htp_hwblobinfo = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>( binaryinfo->contextBinaryInfoV2.hwInfoBlob); +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + } else if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_3) { + htp_graphblobinfo = static_cast<QnnHtpSystemContext_GraphBlobInfo_t*>( + binaryinfo->contextBinaryInfoV3.graphs->graphInfoV3.graphBlobInfo); +#endif } else { QNN_EXECUTORCH_LOG_WARN( "Unknown QNN BinaryInfo version %d.", binaryinfo->version); return Error::Internal; } - if (htp_hwblobinfo == nullptr) { - QNN_EXECUTORCH_LOG_WARN( - "Htp hardware blob information is not found in binary information."); - return Error::Ok; + if (htp_hwblobinfo) { + if (htp_hwblobinfo->version == + QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1) { + spill_fill_buf_ = + (*htp_hwblobinfo).contextBinaryHwInfoBlobV1_t.spillFillBufferSize; + } else { + QNN_EXECUTORCH_LOG_WARN( + "Unknown QNN Htp hw blob info version %d.", htp_hwblobinfo->version); + return Error::Internal; + } } - if (htp_hwblobinfo->version == - QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1) { - spill_fill_buf_ = - (*htp_hwblobinfo).contextBinaryHwInfoBlobV1_t.spillFillBufferSize; - } else { - QNN_EXECUTORCH_LOG_WARN( - "Unknown QNN Htp hw blob info version %d.", htp_hwblobinfo->version); - return Error::Internal; +#if (QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 21) + if (htp_graphblobinfo) { + if (htp_graphblobinfo->version == + QNN_SYSTEM_CONTEXT_HTP_GRAPH_INFO_BLOB_VERSION_V1) { + spill_fill_buf_ = + (*htp_graphblobinfo).contextBinaryGraphBlobInfoV1.spillFillBufferSize; + } else { + QNN_EXECUTORCH_LOG_WARN( + "Unknown QNN Htp graph blob info version %d.", + htp_graphblobinfo->version); + return Error::Internal; + } } +#endif return Error::Ok; } diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 900854ccd7..8c76c11532 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -478,13 +478,6 @@ vTensor::vTensor( if (storage_type != utils::kBuffer) { set_logical_limits(storage_.image_extents_); } - - if (dtype == vkapi::kHalf) { - VK_CHECK_COND( - api::context()->adapter_ptr()->supports_16bit_storage_buffers(), - "Half dtype is only available if the
physical device supports float16 " - "storage buffers!"); - } } // NOLINTNEXTLINE diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index e27723468a..bf501296b1 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -68,6 +68,8 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { void PrepackNode::encode(ComputeGraph* graph) { api::Context* const context = graph->context(); + context->check_device_capabilities(shader_); + vTensorPtr packed = graph->get_tensor(packed_); api::StagingBuffer staging = create_staging_buffer(graph); diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index ad4ff245a1..cd385718ce 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -14,6 +14,8 @@ #define TILE_SIZE ${TILE_SIZE} +#define STRIDE_EQ_DILATION ${STRIDE_EQ_DILATION} + #define BATCH_SIZE_X ${BATCH_SIZE_X} #define BATCH_SIZE_Y ${BATCH_SIZE_Y} @@ -40,6 +42,8 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; * Computes a depthwise convolution. Each shader invocation calculates the * output at a single output location. */ + +#if STRIDE_EQ_DILATION void main() { // x and y are divided by batch size to determine 3d position // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z @@ -121,3 +125,42 @@ void main() { } } } + +#else +void main() { + const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x; + const ivec3 pos = ivec3( + gl_GlobalInvocationID.x % out_limits.x, + div_by_x % out_limits.y, + div_by_x / out_limits.y); + + if (any(greaterThanEqual(pos, out_limits))) { + return; + } + + // Compute the index of the top-left element of the overlay region. Negative + // indices indicate that the top-left element is in a region added by padding. + const ivec2 ipos = pos.xy * stride - padding; + + // Compute the start and end of the input indices to load. Padding is assumed + // to be constant 0 padding, so any reads from the padding region is skipped. + const ivec2 start = ipos; + const ivec2 end = ipos + overlay_region.xy; + + VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); + int kx = 0; + for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) { + for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) { + // The weight kernel was rearranged such that every NxN filter is + // flattened to fit in one row. Each filter was then stacked on top of + // each other vertically. 
+ const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0); + sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum); + kx++; + } + } + + imageStore(t_out, pos, op(sum, out_min, out_max)); +} + +#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml index 9cf6c22c6c..d3672f5ec2 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml @@ -12,6 +12,7 @@ conv2d_dw_output_tile: TILE_SIZE: 3 BATCH_SIZE_X: 4 BATCH_SIZE_Y: 2 + STRIDE_EQ_DILATION: 0 generate_variant_forall: DTYPE: - VALUE: half @@ -25,3 +26,15 @@ conv2d_dw_output_tile: - NAME: conv2d_dw_output_tile_5x5_clamp OPERATOR: clamp(X, A, B) TILE_SIZE: 5 + - NAME: conv2d_dw_sed_output_tile_3x3 + STRIDE_EQ_DILATION: 1 + - NAME: conv2d_dw_sed_output_tile_3x3_clamp + OPERATOR: clamp(X, A, B) + STRIDE_EQ_DILATION: 1 + - NAME: conv2d_dw_sed_output_tile_5x5 + TILE_SIZE: 5 + STRIDE_EQ_DILATION: 1 + - NAME: conv2d_dw_sed_output_tile_5x5_clamp + OPERATOR: clamp(X, A, B) + TILE_SIZE: 5 + STRIDE_EQ_DILATION: 1 diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 64c145fb7e..a7c11cc853 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -126,13 +126,17 @@ vkapi::ShaderInfo get_conv2d_shader( const bool prepack_weights, const Conv2dMethod method, const ValueRef weight, - const bool clamp_out = false) { + const bool clamp_out = false, + const bool stride_equals_dilation = false) { std::string kernel_name; kernel_name.reserve(kShaderNameReserve); switch (method) { case Conv2dMethod::Depthwise: kernel_name = "conv2d_dw"; if (!prepack_weights) { + if (stride_equals_dilation) { + kernel_name += "_sed"; + } const auto& weight_sizes = graph.get_tref(weight)->sizes; if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { kernel_name += "_output_tile_3x3"; @@ -286,22 +290,37 @@ Conv2dMethod get_conv2d_method( return Conv2dMethod::SlidingWindow; } +utils::uvec2 get_conv2d_dw_dispatch_divisor( + const std::vector<int64_t>& weight_sizes) { + if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { + return {4u, 2u}; + } + if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { + return {4u, 2u}; + } + return {4u, 2u}; +} + utils::uvec3 create_conv2d_global_wg_size( ComputeGraph& graph, const Conv2dMethod method, - const ValueRef out) { + const ValueRef out, + const ValueRef weight_data, + const bool stride_equals_dilation) { if (method == Conv2dMethod::Pointwise) { const utils::uvec3 image_extents = graph.logical_limits_of(out); return { utils::div_up(image_extents[0u], 2u), utils::div_up(image_extents[1u], 2u), image_extents[2u]}; - } else if (method == Conv2dMethod::Depthwise) { - const utils::uvec3 image_extents = graph.logical_limits_of(out); + } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) { + const utils::uvec3 image_extents = graph.create_global_wg_size(out); + const utils::uvec2 div = + get_conv2d_dw_dispatch_divisor(graph.get_tref(weight_data)->sizes); return { - utils::div_up(image_extents[0u], 4u), - utils::div_up(image_extents[1u], 2u), - image_extents[2u]}; + utils::div_up(image_extents[0], div[0]), + utils::div_up(image_extents[1], div[1]), + image_extents[2]}; } else { return graph.create_global_wg_size(out); } @@ -364,6 +383,10 @@ void add_conv2d_node( Conv2dParams
extra_params = create_conv2d_params(graph, weight_data, kernel_params, transposed_val); + const bool stride_equals_dilation = + (kernel_params.stride[0] == kernel_params.dilation[0] && + kernel_params.stride[1] == kernel_params.dilation[1]); + OutputParams out_params = {out_min_val, out_max_val}; check_conv2d_params(kernel_params, transposed_val); @@ -374,9 +397,11 @@ void add_conv2d_node( /*prepack_weights = */ false, method, weight_data, - clamp_out); + clamp_out, + stride_equals_dilation); - utils::uvec3 wg_size = create_conv2d_global_wg_size(graph, method, out); + utils::uvec3 wg_size = create_conv2d_global_wg_size( + graph, method, out, weight_data, stride_equals_dilation); if (method == Conv2dMethod::Pointwise || method == Conv2dMethod::Depthwise) { wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1}; diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp index 3a8a59166e..7976d0ddee 100644 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp @@ -151,7 +151,8 @@ VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { // Staging buffers are accessed by both the CPU and GPU, so set the // appropriate flags to indicate that the host device will be accessing // the data from this buffer. - alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | + alloc_create_info.flags |= + VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT; alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 85732d7701..d32fa71573 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -348,6 +348,39 @@ def get_conv_inputs(): [0, 0], 1, ), + ( + (1, 4, 234, 234), + (4, 1, 3, 3), + (4,), + [2, 1], + [1, 1], + [1, 1], + False, + [0, 0], + 4, + ), + ( + (1, 4, 234, 234), + (4, 1, 3, 3), + (4,), + [1, 2], + [1, 1], + [1, 1], + False, + [0, 0], + 4, + ), + ( + (1, 4, 234, 234), + (4, 1, 3, 3), + (4,), + [2, 2], + [1, 1], + [1, 1], + False, + [0, 0], + 4, + ), ] ) return test_suite diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl index ab55d5beea..d26f1a805c 100644 --- a/backends/vulkan/test/op_tests/targets.bzl +++ b/backends/vulkan/test/op_tests/targets.bzl @@ -3,6 +3,44 @@ load("@fbsource//xplat/caffe2:pt_defs.bzl", "get_pt_ops_deps") load("@fbsource//xplat/caffe2:pt_ops.bzl", "pt_operator_library") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +def define_test_targets(test_name, extra_deps = [], src_file = None, is_fbcode = False): + deps_list = [ + "//third-party/googletest:gtest_main", + "//executorch/backends/vulkan:vulkan_graph_runtime", + runtime.external_dep_location("libtorch"), + ] + extra_deps + + src_file_str = src_file if src_file else "{}.cpp".format(test_name) + + runtime.cxx_binary( + name = "{}_bin".format(test_name), + srcs = [ + src_file_str, + ], + compiler_flags = [ + "-Wno-unused-variable", + ], + define_static_target = False, + deps = deps_list, + ) + + runtime.cxx_test( + name = test_name, + srcs = [ + src_file_str, + ], + contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], + fbandroid_additional_loaded_sonames = [ + "torch-code-gen", + "vulkan_graph_runtime", + 
"vulkan_graph_runtime_shaderlib", + ], + platforms = [ANDROID], + use_instrumentation_test = True, + deps = deps_list, + ) + + def define_common_targets(is_fbcode = False): if is_fbcode: return @@ -82,19 +120,6 @@ def define_common_targets(is_fbcode = False): default_outs = ["."], ) - runtime.cxx_binary( - name = "compute_graph_op_tests_bin", - srcs = [ - ":generated_op_correctness_tests_cpp[op_tests.cpp]", - ], - define_static_target = False, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], - ) - runtime.cxx_binary( name = "compute_graph_op_benchmarks_bin", srcs = [ @@ -111,135 +136,17 @@ def define_common_targets(is_fbcode = False): ], ) - runtime.cxx_test( - name = "compute_graph_op_tests", - srcs = [ - ":generated_op_correctness_tests_cpp[op_tests.cpp]", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], + define_test_targets( + "compute_graph_op_tests", + src_file=":generated_op_correctness_tests_cpp[op_tests.cpp]" ) - runtime.cxx_binary( - name = "sdpa_test_bin", - srcs = [ - "sdpa_test.cpp", - ], - compiler_flags = [ - "-Wno-unused-variable", - ], - define_static_target = False, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", - ], - ) - - runtime.cxx_test( - name = "sdpa_test", - srcs = [ - "sdpa_test.cpp", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", - "//executorch/extension/tensor:tensor", - runtime.external_dep_location("libtorch"), - ], - ) - - runtime.cxx_binary( - name = "linear_weight_int4_test_bin", - srcs = [ - "linear_weight_int4_test.cpp", - ], - compiler_flags = [ - "-Wno-unused-variable", - ], - define_static_target = False, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], - ) - - runtime.cxx_test( - name = "linear_weight_int4_test", - srcs = [ - "linear_weight_int4_test.cpp", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", + define_test_targets( + "sdpa_test", + extra_deps = [ "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", "//executorch/extension/tensor:tensor", - runtime.external_dep_location("libtorch"), - ], - ) - - runtime.cxx_binary( - name = "rotary_embedding_test_bin", - srcs = [ - "rotary_embedding_test.cpp", - ], - compiler_flags = [ - "-Wno-unused-variable", 
- ], - define_static_target = False, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], - ) - - runtime.cxx_test( - name = "rotary_embedding_test", - srcs = [ - "rotary_embedding_test.cpp", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - "//executorch/extension/tensor:tensor", - runtime.external_dep_location("libtorch"), - ], + ] ) + define_test_targets("linear_weight_int4_test") + define_test_targets("rotary_embedding_test") diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py index 472127ffe2..6f93e66207 100644 --- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -667,7 +667,6 @@ def gen_op_check_fn(self) -> str: op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {\n" op_check_fn_body = "" - op_check_fn_body += self.gen_conditional_skips() op_check_fn_body += self.gen_graph_build_code() op_check_fn_body += self.gen_graph_exec_code() diff --git a/backends/xnnpack/operators/op_squeeze.py b/backends/xnnpack/operators/op_squeeze.py index 8ed5aa36ae..7a21fe9e55 100644 --- a/backends/xnnpack/operators/op_squeeze.py +++ b/backends/xnnpack/operators/op_squeeze.py @@ -16,7 +16,9 @@ XNNStaticReshape, XNode, ) + from executorch.backends.xnnpack.utils.utils import check_or_raise, get_input_node +from torch.fx.experimental.symbolic_shapes import free_symbols @register_node_visitor @@ -57,7 +59,7 @@ def define_node( num_dynamic_dims = 0 for dim in dynamic_shape: - if isinstance(dim, torch.SymInt): + if free_symbols(dim): num_dynamic_dims += 1 new_shape.append(0) else: @@ -119,7 +121,7 @@ def define_node( num_dynamic_dims = 0 for dim in dynamic_shape: - if isinstance(dim, torch.SymInt): + if free_symbols(dim): num_dynamic_dims += 1 new_shape.append(0) else: diff --git a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md index 5e43a63c76..acfede66e6 100644 --- a/docs/source/build-run-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/build-run-qualcomm-ai-engine-direct-backend.md @@ -59,7 +59,7 @@ This example is verified with SM8550 and SM8450. - Click the "Get Software" button to download a version of QNN SDK. - However, at the time of updating this tutorial, the above website doesn't provide a QNN SDK newer than 2.22.6. - Below are public links to download various QNN versions. We hope they become publicly discoverable soon.
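The op_squeeze.py change above swaps an isinstance check for free_symbols, which also catches dimensions that are SymInt expressions (for example s0 + 1) and treats fully specialized symbolic values as static. A small sketch of the pattern (count_dynamic_dims is an illustrative helper, not part of the patch):

    from torch.fx.experimental.symbolic_shapes import free_symbols

    def count_dynamic_dims(shape) -> int:
        # free_symbols() returns an empty set for plain ints and for
        # symbolic values with no free symbols, so its truthiness is
        # exactly "this dimension is dynamic".
        return sum(1 for dim in shape if free_symbols(dim))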
- - [QNN 2.26.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.26.0.240828.zip) + - [QNN 2.28.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.28.0.241029.zip) The directory with installed Qualcomm AI Engine Direct SDK looks like: ``` diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index 133f9ec50b..7ed768baf2 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -9,7 +9,7 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng - Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama) to know how to run a llama model on mobile via ExecuTorch. - A Qualcomm device with 16GB RAM - We are continuing to optimize our memory usage to ensure compatibility with lower memory devices. -- The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.26.0 or above. +- The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.28.0 or above. ## Instructions diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 2de1e713c9..1208d79b06 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# Copyright 2023-2024 Arm Limited and/or its affiliates. +# Copyright 2023-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -263,11 +263,7 @@ def get_compile_spec( ) -> ArmCompileSpecBuilder: spec_builder = None if target == "TOSA": - spec_builder = ( - ArmCompileSpecBuilder() - .tosa_compile_spec("TOSA-0.80+BI") - .set_permute_memory_format(True) - ) + spec_builder = ArmCompileSpecBuilder().tosa_compile_spec("TOSA-0.80+BI") elif "ethos-u55" in target: spec_builder = ( ArmCompileSpecBuilder() @@ -277,7 +273,6 @@ def get_compile_spec( memory_mode=memory_mode, extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate", ) - .set_permute_memory_format(True) .set_quantize_io(True) .set_input_order(reorder_inputs) ) @@ -290,7 +285,6 @@ def get_compile_spec( memory_mode=memory_mode, extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate", ) - .set_permute_memory_format(True) .set_quantize_io(True) .set_input_order(reorder_inputs) ) diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index bf922360fd..5498bd7897 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -2,7 +2,7 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # -# Copyright 2023-2024 Arm Limited and/or its affiliates. +# Copyright 2023-2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -91,8 +91,8 @@ tosa_reference_model_url="https://review.mlplatform.org/tosa/reference_model" tosa_reference_model_rev="v0.80.1" # vela -vela_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u-vela" -vela_rev="5427dc7e9c1a4c7d554163290faeea75f168772d" +vela_repo_url="https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela" +vela_rev="fc970e3da72e5f6930b840b357684126602b3126" ######## ### Mandatory user args diff --git a/exir/backend/utils.py b/exir/backend/utils.py index 50d1e73fd7..9487c59a84 100644 --- a/exir/backend/utils.py +++ b/exir/backend/utils.py @@ -23,6 +23,7 @@ from executorch.exir.lowered_backend_module import create_submodule_from_nodes from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param +from torch.fx.experimental.symbolic_shapes import has_free_symbols from torch.fx.node import Node from torch.fx.passes.utils.source_matcher_utils import SourcePartition @@ -424,10 +425,7 @@ def is_shape_dynamic(node: torch.fx.Node) -> bool: Check if the node shape is dynamic. """ - # Shape is dynamic if any of the dimensions don't evaluate to a static value - return "val" in node.meta and any( - isinstance(d, torch.SymInt) for d in node.meta["val"].shape - ) + return has_free_symbols(node.meta["val"].shape) # TODO - style: use templated types diff --git a/extension/llm/modules/test/test_attention.py b/extension/llm/modules/test/test_attention.py index 6cd05b4bf6..3ecf0b2b4b 100644 --- a/extension/llm/modules/test/test_attention.py +++ b/extension/llm/modules/test/test_attention.py @@ -33,6 +33,7 @@ def setUp(self): self.num_kv_heads = 8 self.head_dim = 64 self.max_seq_len = 128 + self.encoder_max_seq_len = 128 self.rope_base = 500_000 self.scale_factor = 32 @@ -86,16 +87,26 @@ def setUp(self): max_seq_len=self.max_seq_len, ) self.et_mha.load_state_dict(self.tt_mha.state_dict()) + # Common inputs. seq_len = 10 self.x = torch.randn(1, seq_len, self.embed_dim) + self.y = torch.randn(1, seq_len, self.embed_dim) self.input_pos = torch.arange(seq_len).unsqueeze(0) # shape [1, seq_len] - seq_len_dim = torch.export.Dim("seq_len", min=1, max=100) - self.dynamic_shapes = ( - {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC}, - {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC}, - {0: torch.export.Dim.STATIC, 1: seq_len_dim}, - ) + self.seq_len_dim = torch.export.Dim("seq_len", min=1, max=self.max_seq_len) + self.dynamic_shapes = { + "x": { + 0: torch.export.Dim.STATIC, + 1: self.seq_len_dim, + 2: torch.export.Dim.STATIC, + }, + "y": { + 0: torch.export.Dim.STATIC, + 1: self.seq_len_dim, + 2: torch.export.Dim.STATIC, + }, + "input_pos": {0: torch.export.Dim.STATIC, 1: self.seq_len_dim}, + } self.causal_mask = torch.tril( torch.ones( size=(self.max_seq_len, self.max_seq_len), @@ -110,8 +121,8 @@ def test_attention_eager(self): assert_close(et_res, tt_res) # test with kv cache - self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=20) - self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=20) + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) et_res = self.et_mha(self.x, self.x) # Self attention. tt_res = self.tt_mha(self.x, self.x) # Self attention. @@ -144,12 +155,12 @@ def test_attention_export(self): # Self attention. 
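The attention tests now pass dynamic_shapes as a dict keyed by the forward() argument names instead of a positional tuple, which makes it explicit that x, y, and input_pos share one sequence-length dimension. The core pattern, condensed (argument names as in the test above; dims left unmentioned in the dict form are treated as static):

    import torch

    seq_len_dim = torch.export.Dim("seq_len", min=1, max=128)
    dynamic_shapes = {
        "x": {1: seq_len_dim},          # batch and embed dims stay static
        "y": {1: seq_len_dim},
        "input_pos": {1: seq_len_dim},
    }
    # torch.export.export(mha, (x, y), kwargs={"input_pos": pos},
    #                     dynamic_shapes=dynamic_shapes)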
# test with kv cache - self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) - self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) with torch.no_grad(): et_mha_ep = torch.export.export( self.et_mha, - (self.x, self.x), + (self.x, self.y), kwargs={"input_pos": self.input_pos}, dynamic_shapes=self.dynamic_shapes, strict=True, @@ -166,8 +177,8 @@ def test_attention_aoti(self): # Self attention. # test with kv cache - self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) - self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) with torch.no_grad(): so = torch._export.aot_compile( self.et_mha, @@ -189,13 +200,13 @@ def test_attention_aoti(self): def test_attention_executorch(self): # Self attention. - self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) - self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100) + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) with torch.no_grad(): et_mha_ep = torch.export.export( self.et_mha, - (self.x, self.x), + (self.x, self.y), kwargs={"input_pos": self.input_pos}, dynamic_shapes=self.dynamic_shapes, strict=True, @@ -222,22 +233,18 @@ def test_attention_executorch(self): def test_attention_torch_cond_eager(self): # Different from vanilla torchtune MHA, we rewrite the if condition with torch.cond. We need to make sure they are giving the same results regarding the if condition. - # For the first run of MHA we provide `y` (self.x) but for the second run it will be a tensor full of nan. + # For the first run of MHA we provide `y` but for the second run it will be a tensor full of nan. self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) mask = self.causal_mask[self.input_pos, :] # First run. - et_res = self.et_mha( - self.x, self.x, mask=mask, input_pos=self.input_pos - ) # Self attention with input pos. - tt_res = self.tt_mha( - self.x, self.x, mask=mask, input_pos=self.input_pos - ) # Self attention with input pos. + et_res = self.et_mha(self.x, self.y, mask=mask, input_pos=self.input_pos) + tt_res = self.tt_mha(self.x, self.y, mask=mask, input_pos=self.input_pos) assert_close(et_res, tt_res) - # Second run test kv cache read. Input pos is [10, 11, ..., 19] + # Second run tests kv cache read. 
Input pos is [10, 11, ..., 19] next_input_pos = torch.arange(10, 20).unsqueeze(0) empty_y = torch.full_like(self.x, torch.nan) @@ -246,3 +253,101 @@ def test_attention_torch_cond_eager(self): tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos) assert_close(et_res, tt_res) + + def test_attention_torch_cond_export(self): + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + mask = self.causal_mask[self.input_pos, :] + dynamic_shapes = { + **self.dynamic_shapes, + **{ + "mask": { + 0: torch.export.Dim.STATIC, + 1: self.seq_len_dim, + 2: torch.export.Dim.STATIC, + } + }, + } + with torch.no_grad(): + et_mha_ep = torch.export.export( + self.et_mha, + (self.x, self.y), + kwargs={ + "mask": mask, + "input_pos": self.input_pos, + }, + dynamic_shapes=dynamic_shapes, + strict=True, + ) + + # First run. + et_res = et_mha_ep.module()(self.x, self.y, mask=mask, input_pos=self.input_pos) + tt_res = self.tt_mha(self.x, self.y, mask=mask, input_pos=self.input_pos) + + assert_close(et_res, tt_res) + + # Second run tests kv cache read. Input pos is [10, 11, ..., 19] + next_input_pos = torch.arange(10, 20).unsqueeze(0) + empty_y = torch.full_like(self.y, torch.nan) + mask = self.causal_mask[next_input_pos, :] + et_res = et_mha_ep.module()( + self.x, empty_y, mask=mask, input_pos=next_input_pos + ) + tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos) + + assert_close(et_res, tt_res) + + def test_attention_torch_cond_executorch(self): + self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len) + mask = self.causal_mask[self.input_pos, :] + dynamic_shapes = { + **self.dynamic_shapes, + **{ + "mask": { + 0: torch.export.Dim.STATIC, + 1: self.seq_len_dim, + 2: torch.export.Dim.STATIC, + } + }, + } + with torch.no_grad(): + et_mha_ep = torch.export.export( + self.et_mha, + (self.x, self.y), + kwargs={ + "mask": mask, + "input_pos": self.input_pos, + }, + dynamic_shapes=dynamic_shapes, + strict=True, + ) + et_program = to_edge( + et_mha_ep, + compile_config=EdgeCompileConfig( + _core_aten_ops_exception_list=[torch.ops.aten._assert_async.msg], + _check_ir_validity=False, + ), + ).to_executorch( + config=ExecutorchBackendConfig( + passes=[InitializedMutableBufferPass(["cache_pos"])], + ) + ) + + # First run. + runtime = Runtime.get() + program = runtime.load_program(et_program.buffer) + method = program.load_method("forward") + et_res = method.execute((self.x, self.y, mask, self.input_pos)) + tt_res = self.tt_mha(self.x, self.y, mask=mask, input_pos=self.input_pos) + + assert_close(et_res[0], tt_res) + + # Second run tests kv cache read. 
Input pos is [10, 11, ..., 19] + next_input_pos = torch.arange(10, 20).unsqueeze(0) + empty_y = torch.full_like(self.y, torch.nan) + mask = self.causal_mask[next_input_pos, :] + et_res = method.execute((self.x, empty_y, mask, next_input_pos)) + tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos) + + assert_close(et_res[0], tt_res) diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml index 44676f2c23..833b37cfac 100644 --- a/kernels/aten/functions.yaml +++ b/kernels/aten/functions.yaml @@ -257,6 +257,8 @@ - op: mean.out +- op: mean.dtype_out + - op: min.dim_min - op: min.unary_out diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp index aeb0d7f8ca..6730404dde 100644 --- a/kernels/portable/cpu/op_mean.cpp +++ b/kernels/portable/cpu/op_mean.cpp @@ -66,6 +66,14 @@ Tensor& mean_dim_out( return out; } +Tensor& mean_dtype_out( + KernelRuntimeContext& ctx, + const Tensor& in, + optional<ScalarType> dtype, + Tensor& out) { + return mean_dim_out(ctx, in, ArrayRef<int64_t>(), false, dtype, out); +} + } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/reduce_util.cpp b/kernels/portable/cpu/util/reduce_util.cpp index 08237e07b9..884c10b813 100644 --- a/kernels/portable/cpu/util/reduce_util.cpp +++ b/kernels/portable/cpu/util/reduce_util.cpp @@ -386,6 +386,7 @@ bool check_mean_dim_args( check_reduction_args(in, dim_list, keepdim, dtype, out)); if (dtype) { + ET_LOG(Info, "dtype is %hhd", static_cast<int8_t>(dtype.value())); ET_LOG_AND_RETURN_IF_FALSE(torch::executor::isFloatingType(dtype.value())); ET_LOG_AND_RETURN_IF_FALSE(out.scalar_type() == dtype.value()); } else { diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 96382eb497..3221b8fe34 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -577,6 +577,11 @@ - arg_meta: null kernel_name: torch::executor::mean_dim_out +- op: mean.dtype_out + kernels: + - arg_meta: null + kernel_name: torch::executor::mean_dtype_out + - op: min.dim_min kernels: - arg_meta: null diff --git a/kernels/test/op_mean_test.cpp b/kernels/test/op_mean_test.cpp index 9821cb6b47..c5ba00b20e 100644 --- a/kernels/test/op_mean_test.cpp +++ b/kernels/test/op_mean_test.cpp @@ -9,7 +9,7 @@ #include // Declares the operator #include #include -#include +#include #include #include #include @@ -22,6 +22,7 @@ using exec_aten::ArrayRef; using exec_aten::optional; using exec_aten::ScalarType; using exec_aten::Tensor; +using executorch::runtime::Error; using torch::executor::testing::TensorFactory; class OpMeanOutTest : public OperatorTest { @@ -36,6 +37,13 @@ class OpMeanOutTest : public OperatorTest { context_, self, dim, keepdim, dtype, out); } + Tensor& op_mean_dtype_out( + const Tensor& self, + optional<ScalarType> dtype, + Tensor& out) { + return torch::executor::aten::mean_outf(context_, self, dtype, out); + } + template <ScalarType IN_DTYPE> void test_mean_dim_out_invalid_dimensions() { TensorFactory<IN_DTYPE> tf_in; @@ -466,3 +474,68 @@ TEST_F(OpMeanOutTest, DynamicShapeUnbound) { op_mean_out(x, ArrayRef<int64_t>{1}, false, ScalarType::Float, out); EXPECT_TENSOR_CLOSE(out, expected_result); } + +TEST_F(OpMeanOutTest, DTypeOutFloatValid) { + TensorFactory<ScalarType::Float> tf; + + Tensor x = tf.make( + {10, 10}, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + Tensor expected_result = tf.make({}, {1.0}); + + Tensor out = tf.zeros({}); + Tensor ret = op_mean_dtype_out(x, ScalarType::Float, out); + EXPECT_TENSOR_CLOSE(out, expected_result); +} + +TEST_F(OpMeanOutTest, DTypeOutFloatToBoolInvalid) { + TensorFactory<ScalarType::Float> tf; + + Tensor x = tf.make( + {10, 10}, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + Tensor expected_result = tf.make({}, {1.0}); + + Tensor out = tf.zeros({}); + + ET_EXPECT_KERNEL_FAILURE( + context_, op_mean_dtype_out(x, ScalarType::Bool, out)); +} + +TEST_F(OpMeanOutTest, DTypeOutFloatInfinity) { + TensorFactory<ScalarType::Float> tf; + + Tensor x = tf.make({2, 1}, {INFINITY, INFINITY}); + Tensor expected_result = tf.make({}, {INFINITY}); + + Tensor out = tf.zeros({}); + + Tensor ret = op_mean_dtype_out(x, ScalarType::Float, out); + EXPECT_TENSOR_CLOSE(out, expected_result); +} + +TEST_F(OpMeanOutTest, DTypeOutFloatNAN) { + TensorFactory<ScalarType::Float> tf; + + Tensor x = tf.make({2, 1}, {NAN, INFINITY}); + Tensor expected_result = tf.make({}, {NAN}); + + Tensor out = tf.zeros({}); + + Tensor ret = op_mean_dtype_out(x, ScalarType::Float, out); + EXPECT_TENSOR_CLOSE(out, expected_result); +} diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index db9417dd88..674af6d69f 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -313,6 +313,8 @@ Error Method::parse_values() { "Null value at index %zu", i); + const auto val = serialization_value->val(); + switch (serialization_value->val_type()) { case executorch_flatbuffer::KernelTypes::Null: { // Placement new as the list elements are not initialized, so calling @@ -321,18 +323,21 @@ Error Method::parse_values() { new (&values_[i]) EValue(); } break; case executorch_flatbuffer::KernelTypes::Int: { - new (&values_[i]) EValue(serialization_value->val_as_Int()->int_val()); + new (&values_[i]) EValue( + static_cast<const executorch_flatbuffer::Int*>(val)->int_val()); } break; case executorch_flatbuffer::KernelTypes::Double: { new (&values_[i]) - EValue(serialization_value->val_as_Double()->double_val()); + EValue(static_cast<const executorch_flatbuffer::Double*>(val) + ->double_val()); } break; case executorch_flatbuffer::KernelTypes::Bool: { - new (&values_[i]) - EValue(serialization_value->val_as_Bool()->bool_val()); + new (&values_[i]) EValue( + static_cast<const executorch_flatbuffer::Bool*>(val)->bool_val()); } break; case executorch_flatbuffer::KernelTypes::IntList: { - const auto items = serialization_value->val_as_IntList()->items(); + const auto items = + static_cast<const executorch_flatbuffer::IntList*>(val)->items(); ET_CHECK_OR_RETURN_ERROR( items != nullptr, InvalidProgram, "Missing list at index %zu", i); // Allocate space for boxed and unboxed list representations using @@ -352,7 +357,8 @@ Error Method::parse_values() { BoxedEvalueList<int64_t>(evalp_list, int_list, items->size())); } break; case executorch_flatbuffer::KernelTypes::BoolList: { - const auto items = serialization_value->val_as_BoolList()->items(); + const auto
items = + static_cast<const executorch_flatbuffer::BoolList*>(val)->items(); ET_CHECK_OR_RETURN_ERROR( items != nullptr, InvalidProgram, "Missing list at index %zu", i); // NOTE: This is technically not portable. A platform could technically @@ -366,14 +372,17 @@ Error Method::parse_values() { (const bool*)items->data(), items->size())); } break; case executorch_flatbuffer::KernelTypes::DoubleList: { - const auto items = serialization_value->val_as_DoubleList()->items(); + const auto items = + static_cast<const executorch_flatbuffer::DoubleList*>(val)->items(); ET_CHECK_OR_RETURN_ERROR( items != nullptr, InvalidProgram, "Missing list at index %zu", i); new (&values_[i]) EValue(exec_aten::ArrayRef<double>(items->data(), items->size())); } break; case executorch_flatbuffer::KernelTypes::String: { - const auto fb_str = serialization_value->val_as_String()->string_val(); + const auto fb_str = + static_cast<const executorch_flatbuffer::String*>(val) + ->string_val(); ET_CHECK_OR_RETURN_ERROR( fb_str != nullptr, InvalidProgram, @@ -383,7 +392,9 @@ Error Method::parse_values() { } break; case executorch_flatbuffer::KernelTypes::Tensor: { auto t = deserialization::parseTensor( - program_, memory_manager_, serialization_value->val_as_Tensor()); + program_, + memory_manager_, + static_cast<const executorch_flatbuffer::Tensor*>(val)); if (!t.ok()) { ET_LOG( Error, @@ -398,7 +409,7 @@ Error Method::parse_values() { // get list of serialization tensors and allocate storage for executor // tensors auto tensors = deserialization::parseTensorList( - serialization_value->val_as_TensorList()->items(), + static_cast<const executorch_flatbuffer::TensorList*>(val)->items(), values_, memory_manager_); if (!tensors.ok()) { @@ -415,7 +426,9 @@ Error Method::parse_values() { // Same as TensorList but optional instead of Tensor auto tensors = deserialization::parseListOptionalType( - serialization_value->val_as_OptionalTensorList()->items(), + static_cast<const executorch_flatbuffer::OptionalTensorList*>( + val) + ->items(), values_, memory_manager_); if (!tensors.ok()) { diff --git a/runtime/executor/method_meta.cpp b/runtime/executor/method_meta.cpp index f43398d0ab..5be486b4d8 100644 --- a/runtime/executor/method_meta.cpp +++ b/runtime/executor/method_meta.cpp @@ -116,6 +116,14 @@ Result<Tag> MethodMeta::input_tag(size_t index) const { index, num_inputs); auto input_index = s_plan_->inputs()->Get(index); + size_t num_values = s_plan_->values()->size(); + ET_CHECK_OR_RETURN_ERROR( + input_index >= 0 && input_index < num_values, + InvalidProgram, + "internal value index %d out of range [0,%zu) for input %zu", + input_index, + num_values, + index); auto serialization_value = s_plan_->values()->Get(input_index); return get_tag(serialization_value, index); } @@ -132,6 +140,7 @@ Result<TensorInfo> MethodMeta::input_tensor_meta(size_t index) const { (size_t)tag.get(), index); auto input_index = s_plan_->inputs()->Get(index); + // input_index was already validated by input_tag(). auto tensor_value = s_plan_->values()->Get(input_index)->val_as_Tensor(); return TensorInfo( Span<const int32_t>( @@ -156,8 +165,16 @@ Result<Tag> MethodMeta::output_tag(size_t index) const { "index %zu out of range.
num_outputs: %zu", index, num_outputs); - auto input_index = s_plan_->outputs()->Get(index); - auto serialization_value = s_plan_->values()->Get(input_index); + auto output_index = s_plan_->outputs()->Get(index); + size_t num_values = s_plan_->values()->size(); + ET_CHECK_OR_RETURN_ERROR( + output_index >= 0 && output_index < num_values, + InvalidProgram, + "internal value index %d out of range [0,%zu) for output %zu", + output_index, + num_values, + index); + auto serialization_value = s_plan_->values()->Get(output_index); return get_tag(serialization_value, index); } @@ -173,6 +190,7 @@ Result MethodMeta::output_tensor_meta(size_t index) const { (size_t)tag.get(), index); auto output_index = s_plan_->outputs()->Get(index); + // output_index was already validated by output_tag(). auto tensor_value = s_plan_->values()->Get(output_index)->val_as_Tensor(); return TensorInfo( diff --git a/shim/xplat/executorch/backends/qualcomm/qnn_version.bzl b/shim/xplat/executorch/backends/qualcomm/qnn_version.bzl index 75019982af..5cb801489e 100644 --- a/shim/xplat/executorch/backends/qualcomm/qnn_version.bzl +++ b/shim/xplat/executorch/backends/qualcomm/qnn_version.bzl @@ -1,2 +1,2 @@ def get_qnn_library_verision(): - return "2.26" + return "2.28" diff --git a/shim/xplat/executorch/codegen/codegen.bzl b/shim/xplat/executorch/codegen/codegen.bzl index 46cdaebcb3..8e0e89eda5 100644 --- a/shim/xplat/executorch/codegen/codegen.bzl +++ b/shim/xplat/executorch/codegen/codegen.bzl @@ -397,11 +397,11 @@ def build_portable_lib(name, oplist_header_name, feature = None, expose_operator # Currently fbcode links all dependent libraries through shared # library, and it blocks users like unit tests to use kernel # implementation directly. So we enable this for xplat only. - compiler_flags = ["-Wno-missing-prototypes", "-fvisibility=hidden"] - if expose_operator_symbols: + compiler_flags = ["-Wno-missing-prototypes"] + if not expose_operator_symbols: # Removing '-fvisibility=hidden' exposes operator symbols. # This allows operators to be called outside of the kernel registry. - compiler_flags = ["-Wno-missing-prototypes"] + compiler_flags += ["-fvisibility=hidden"] # Build portable lib. runtime.cxx_library( diff --git a/test/size_test.cpp b/test/size_test.cpp index 88b605c3bf..1fab1e914e 100644 --- a/test/size_test.cpp +++ b/test/size_test.cpp @@ -94,7 +94,7 @@ int main(int argc, char** argv) { // It assumes the outputs are all tensors. for (size_t i = 0; i < method->outputs_size(); i++) { auto output_tensor = output_list[i].toTensor(); - auto data_output = output_tensor.const_data_ptr(); + [[maybe_unused]] auto data_output = output_tensor.const_data_ptr(); for (size_t j = 0; j < output_list[i].toTensor().numel(); ++j) { ET_LOG(Info, "%f", data_output[j]); }