Update base for Update on "Remove ExecuTorch copy of Vectorized"
All uses are outside ExecuTorch core, so we can just use ATen Vectorized.

Differential Revision: [D66396016](https://our.internmc.facebook.com/intern/diff/D66396016/)

[ghstack-poisoned]
swolchok committed Jan 17, 2025
2 parents 84fc8fb + 8494b90 commit 0e3ebb4
Showing 95 changed files with 2,577 additions and 1,275 deletions.
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
@@ -1,4 +1,5 @@
# The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml
tracking_issue: 7679
ciflow_push_tags:
- ciflow/android
- ciflow/apple
4 changes: 2 additions & 2 deletions .github/workflows/android-perf.yml
@@ -260,7 +260,7 @@ jobs:
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
export PYTHONPATH=$(pwd)/..
@@ -347,7 +347,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
export ANDROID_ABIS="arm64-v8a"
PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
# Let's see how expensive this job is, we might want to tone it down by running it periodically
benchmark-on-device:
22 changes: 22 additions & 0 deletions .github/workflows/pull.yml
@@ -332,6 +332,9 @@ jobs:

unittest-arm:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -394,6 +397,25 @@ jobs:
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
test-qnn-models-linux:
name: test-qnn-models-linux
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-qnn-sdk
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 180
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
# placeholder for running test_qnn_delegate.py, can use matrix such that we can trigger different jobs, refers to test-llama-runner-qnn-linux
# reminder: make sure each job runs fast
test-phi-3-mini-runner-linux:
name: test-phi-3-mini-runner-linux
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
6 changes: 6 additions & 0 deletions .github/workflows/trunk.yml
@@ -132,6 +132,9 @@ jobs:
test-arm-backend-delegation:
name: test-arm-backend-delegation
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -159,6 +162,9 @@ jobs:
test-arm-reference-delegation:
name: test-arm-reference-delegation
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
1 change: 1 addition & 0 deletions .lintrunner.toml
@@ -298,6 +298,7 @@ include_patterns = [
'build/**/*.py',
'codegen/**/*.py',
# 'devtools/**/*.py',
'devtools/visualization/**/*.py',
'docs/**/*.py',
# 'examples/**/*.py',
# 'exir/**/*.py',
6 changes: 3 additions & 3 deletions backends/apple/coreml/README.md
@@ -93,14 +93,14 @@ class Model(torch.nn.Module):
source_model = Model()
example_inputs = (torch.randn((1, 3, 256, 256)), )

pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module()

quantization_config = LinearQuantizerConfig.from_dict(
{
"global_config": {
"quantization_scheme": QuantizationScheme.symmetric,
"activation_dtype": torch.uint8,
"weight_dtype": torch.int8,
"activation_dtype": torch.quint8,
"weight_dtype": torch.qint8,
"weight_per_channel": True,
}
}
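For context, the corrected README flow can be sketched end to end as below. This is a minimal sketch assuming coremltools with the `optimize.torch` APIs is installed; the tiny `Model` here stands in for the README's fuller example.

```python
import torch
from coremltools.optimize.torch.quantization.quantization_config import (
    LinearQuantizerConfig,
    QuantizationScheme,
)
from torch.export import export_for_training


class Model(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x)


source_model = Model()
example_inputs = (torch.randn((1, 3, 256, 256)),)

# Export the instantiated model, not the class (the `model` -> `source_model` fix).
pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module()

# Use PyTorch's quantized dtypes (quint8/qint8), not plain integer dtypes.
quantization_config = LinearQuantizerConfig.from_dict(
    {
        "global_config": {
            "quantization_scheme": QuantizationScheme.symmetric,
            "activation_dtype": torch.quint8,
            "weight_dtype": torch.qint8,
            "weight_per_channel": True,
        }
    }
)
```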
115 changes: 63 additions & 52 deletions backends/arm/_passes/arm_pass_manager.py
@@ -7,7 +7,6 @@

# pyre-unsafe

import torch
from executorch.backends.arm._passes.annotate_channels_last_dim_order_pass import (
AnnotateChannelsLastDimOrder,
)
@@ -47,7 +46,7 @@
)
from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
ConvertMeanDimToAveragePool,
ConvertMeanDimToAveragePoolPass,
)
from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass
from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass
@@ -61,86 +60,98 @@
from executorch.backends.arm._passes.unsqueeze_scalar_placeholders_pass import (
UnsqueezeScalarPlaceholdersPass,
)
from executorch.backends.arm.tosa_specification import TosaSpecification
from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
from executorch.exir import ExportedProgram
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_manager import PassManager
from torch.fx import GraphModule


class ArmPassManager(PassManager):

def _transform(self, graph_module: torch.fx.GraphModule):
def __init__(self, tosa_spec: TosaSpecification) -> None:
self.tosa_spec = tosa_spec
super().__init__()

def _transform(self, graph_module: GraphModule):
return self(graph_module).graph_module

def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
"""Apply passes before transforming program to backend"""
def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
self.add_pass(FuseQuantizedActivationPass())
self.add_pass(RemoveGetItemPass())
self.add_pass(ConvertSplitToSlicePass())
self.add_pass(ConvertMmToBmmPass())
self.add_pass(DecomposeLinearPass())
self.add_pass(ConvertMeanDimToAveragePoolPass())

self.add_pass(AnnotateDecomposedMatmulPass())
self.add_pass(QuantizeFullArgument())
self.add_pass(FoldAndAnnotateQParamsPass())
self.add_pass(RetraceFoldedDtypesPass())
self.add_pass(InsertTableOpsPass(exported_program))

self.add_pass(RemoveClonePass())
self.add_pass(SizeAdjustConv2DPass())
self.add_pass(ConvertExpandCopyToRepeatPass())
self.add_pass(UnsqueezeBeforeRepeatPass())
self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
self.add_pass(CastInt64ToInt32Pass(exported_program))
self.add_pass(MatchArgRanksPass(exported_program))
self.add_pass(KeepDimsFalseToSqueezePass())
self.add_pass(Conv1dUnsqueezePass(exported_program))
self.add_pass(DecomposeSelectPass())

self.add_pass(AnnotateChannelsLastDimOrder())

return self._transform(exported_program.graph_module)

def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:

self.add_pass(FuseQuantizedActivationPass())
self.add_pass(RemoveGetItemPass())
self.add_pass(ConvertSplitToSlicePass())
self.add_pass(ConvertMmToBmmPass())
self.add_pass(DecomposeLinearPass())
self.add_pass(DecomposeLayerNormPass())
self.add_pass(DecomposeVarPass())
self.add_pass(ConvertMeanDimToAveragePool())
self.add_pass(DecomposeMeanDimPass())
self.add_pass(ConvertSplitToSlicePass())
self.add_pass(ConvertMmToBmmPass())
# TODO MLETORCH-558
self.add_pass(ConvertMeanDimToAveragePoolPass())
self.add_pass(DecomposeDivPass())
self.add_pass(DecomposeSoftmaxesPass())

self.add_pass(AnnotateDecomposedMatmulPass())
self.add_pass(QuantizeFullArgument())
self.add_pass(
FoldAndAnnotateQParamsPass(
[
exir_ops.edge.aten.minimum.default,
exir_ops.edge.aten.maximum.default,
exir_ops.edge.aten.add.Tensor,
exir_ops.edge.aten.avg_pool2d.default,
exir_ops.edge.aten.bmm.default,
exir_ops.edge.aten.cat.default,
exir_ops.edge.aten.convolution.default,
exir_ops.edge.aten.clone.default,
exir_ops.edge.aten.exp.default,
exir_ops.edge.aten.expand_copy.default,
exir_ops.edge.aten.full.default,
exir_ops.edge.aten.hardtanh.default,
exir_ops.edge.aten.log.default,
exir_ops.edge.aten.max_pool2d.default,
exir_ops.edge.aten.mul.Tensor,
exir_ops.edge.aten.permute_copy.default,
exir_ops.edge.aten.reciprocal.default,
exir_ops.edge.aten.relu.default,
exir_ops.edge.aten.repeat.default,
exir_ops.edge.aten.rsqrt.default,
exir_ops.edge.aten.select_copy.int,
exir_ops.edge.aten.sigmoid.default,
exir_ops.edge.aten.slice_copy.Tensor,
exir_ops.edge.aten.squeeze_copy.dims,
exir_ops.edge.aten.sub.Tensor,
exir_ops.edge.aten.sum.dim_IntList,
exir_ops.edge.aten.tanh.default,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.upsample_nearest2d.vec,
exir_ops.edge.aten.view_copy.default,
]
)
)
self.add_pass(FoldAndAnnotateQParamsPass())
self.add_pass(RetraceFoldedDtypesPass())
self.add_pass(InsertTableOpsPass(exported_program))

self.add_pass(RemoveClonePass())
self.add_pass(SizeAdjustConv2DPass())
self.add_pass(ConvertExpandCopyToRepeatPass())
self.add_pass(UnsqueezeBeforeRepeatPass())
self.add_pass(CastInt64ToInt32Pass(exported_program))
self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
self.add_pass(SizeAdjustConv2DPass())
self.add_pass(RemoveClonePass())
self.add_pass(CastInt64ToInt32Pass(exported_program))
self.add_pass(MatchArgRanksPass(exported_program))
self.add_pass(DecomposeDivPass())
self.add_pass(KeepDimsFalseToSqueezePass())
self.add_pass(Conv1dUnsqueezePass(exported_program))
self.add_pass(DecomposeSoftmaxesPass())
self.add_pass(DecomposeSelectPass())

self.add_pass(AnnotateChannelsLastDimOrder())

return self._transform(exported_program.graph_module)

def transform_for_annotation_pipeline(self, graph_module: torch.fx.GraphModule):
def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
"""Apply passes before transforming program to backend"""
if self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+BI"):
return self._tosa_080_BI_pipeline(exported_program)
elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+MI"):
return self._tosa_080_MI_pipeline(exported_program)
else:
raise NotImplementedError(
f"No pass pipeline implemented for {self.tosa_spec=}"
)

def transform_for_annotation_pipeline(self, graph_module: GraphModule):
self.add_pass(ScalarsToAttributePass())
self.add_pass(DecomposeLayerNormPass())
self.add_pass(DecomposeVarPass())
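As a usage sketch of the new entry point: the manager is now constructed with a TOSA specification and dispatches to the matching pipeline. This is an illustration based on the diff above; `exported_program` is assumed to come from an earlier export step.

```python
from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager
from executorch.backends.arm.tosa_specification import TosaSpecification

# exported_program: an ExportedProgram produced earlier in the lowering flow.
tosa_spec = TosaSpecification.create_from_string("TOSA-0.80.0+BI")

# Dispatches to _tosa_080_BI_pipeline; an unsupported spec raises NotImplementedError.
graph_module = ArmPassManager(tosa_spec).transform_to_backend_pipeline(exported_program)
```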
6 changes: 5 additions & 1 deletion backends/arm/_passes/cast_int64_pass.py
@@ -1,4 +1,4 @@
# Copyright 2024 Arm Limited and/or its affiliates.
# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
@@ -17,6 +17,10 @@


class CastInt64ToInt32Pass(ExportPass):
"""
Cast int64 buffers to int32 if the int64 data is in int32 range.
"""

def __init__(self, exported_program: torch.export.ExportedProgram):
super(CastInt64ToInt32Pass, self).__init__()
self.exported_program = exported_program
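The range check the new docstring describes can be illustrated standalone; a sketch of the idea, not the pass's actual implementation:

```python
import torch


def fits_in_int32(t: torch.Tensor) -> bool:
    # int64 data can be narrowed safely when every element is within int32 range.
    info = torch.iinfo(torch.int32)
    return bool(t.min().item() >= info.min and t.max().item() <= info.max)


buf = torch.tensor([1, 2, 3], dtype=torch.int64)
if fits_in_int32(buf):
    buf = buf.to(torch.int32)  # dtype narrowed, values preserved
```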
13 changes: 6 additions & 7 deletions backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py
@@ -1,12 +1,12 @@
# Copyright 2024 Arm Limited and/or its affiliates.
# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import copy

from typing import cast, Dict, Iterable, Set, Tuple
from typing import cast, Dict, Set, Tuple

from executorch.backends.arm.tosa_quant_utils import QuantArgs

@@ -55,7 +55,7 @@ def get_output_qparams(node: Node) -> dict[int, QuantArgs]:
class FoldAndAnnotateQParamsPass(ExportPass):
"""
A pass that walks the graph and removes any DQ and Q nodes before and after the target
node in the supplied list of operators.
node.
The quantization parameters from the DQ/Q nodes are stored as meta values to be
accessible for later lowering and serialization passes.
The assumption is that the quantization annotatation adds DQ nodes for all tensor
@@ -82,9 +82,8 @@ class FoldAndAnnotateQParamsPass(ExportPass):
"""

def __init__(self, targeted_ops: Iterable[EdgeOpOverload]) -> None:
def __init__(self) -> None:
super().__init__()
self.targeted_ops = targeted_ops

def fold_and_annotate_arg(
self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int
@@ -131,7 +130,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
# Loop over the graph nodes and find any node in the 'targeted_ops' list.
for n in graph_module.graph.nodes:
n = cast(Node, n)
if n.op != "call_function" or n.target not in self.targeted_ops:
if n.op != "call_function":
continue

# Make sure we haven't already set qparams meta information on the node
Expand Down Expand Up @@ -180,7 +179,7 @@ class QuantizeFullArgument(ExportPass):

def call(self, graph_module: GraphModule) -> PassResult:
modified = False
# Loop over the graph nodes and find any node in the 'targeted_ops' list.
# Loop over the graph nodes and find full.default nodes.
for n in graph_module.graph.nodes:
n = cast(Node, n)
if n.target != exir_ops.edge.aten.full.default:
7 changes: 4 additions & 3 deletions backends/arm/_passes/fuse_quantized_activation_pass.py
@@ -19,12 +19,13 @@ def _is_fuseable_quantized_activation(self, node: Node):
is_fuseable = min_val == 0

is_quantized = len(node.users) == 1 and next(iter(node.users)).target == q_op
if is_quantized:
if is_fuseable and is_quantized:
quant_node = next(iter(node.users))
zp = quant_node.args[2]
qmin = quant_node.args[3]

return is_fuseable and is_quantized and zp == qmin
return zp == qmin
else:
return False

def _is_fuseable_input(self, node: Node):
return (
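The restructured guard reads as a short-circuit: the quantize node's zero point and qmin are inspected only once both fusibility conditions already hold. A standalone paraphrase of that control flow (names are illustrative, not the pass's real signature):

```python
def can_fuse(is_fuseable: bool, is_quantized: bool, quant_args: tuple) -> bool:
    # Inspect the quantize node's args only when fusion is still possible;
    # this avoids indexing into quant_args for non-fuseable activations.
    if is_fuseable and is_quantized:
        zero_point, qmin = quant_args[2], quant_args[3]
        return zero_point == qmin
    return False
```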
4 changes: 2 additions & 2 deletions backends/arm/_passes/meandim_to_averagepool_pass.py
@@ -1,4 +1,4 @@
# Copyright 2024 Arm Limited and/or its affiliates.
# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -16,7 +16,7 @@
Argument = Any


class ConvertMeanDimToAveragePool(ExportPass):
class ConvertMeanDimToAveragePoolPass(ExportPass):
"""
Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation.
"""
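The equivalence this pass relies on is easy to verify directly; a minimal sketch with an illustrative tensor shape:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 8, 16, 16)

# Mean over the last two dims with keepdim=True ...
mean_out = x.mean(dim=[-1, -2], keepdim=True)

# ... matches average pooling with a kernel spanning the full spatial extent.
pool_out = F.avg_pool2d(x, kernel_size=x.shape[-2:])

assert torch.allclose(mean_out, pool_out, atol=1e-6)
```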
3 changes: 2 additions & 1 deletion backends/arm/_passes/remove_clone_pass.py
@@ -1,4 +1,4 @@
# Copyright 2024 Arm Limited and/or its affiliates.
# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
@@ -11,6 +11,7 @@


class RemoveClonePass(ExportPass):
"""Remove all clones from graph_module"""

def call_operator(self, op, args, kwargs, meta):
if op != exir_ops.edge.aten.clone.default:
(Diffs for the remaining changed files are not shown.)