Skip to content

Commit

Permalink
Update on "Remove ExecuTorch copy of Vectorized"
Browse files Browse the repository at this point in the history
All uses are outside ExecuTorch core, so we can just use ATen Vectorized.

Differential Revision: [D66396016](https://our.internmc.facebook.com/intern/diff/D66396016/)

[ghstack-poisoned]
  • Loading branch information
swolchok committed Jan 17, 2025
2 parents 0455b2b + 0e3ebb4 commit 10508a5
Show file tree
Hide file tree
Showing 95 changed files with 2,577 additions and 1,275 deletions.
1 change: 1 addition & 0 deletions .github/pytorch-probot.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml
tracking_issue: 7679
ciflow_push_tags:
- ciflow/android
- ciflow/apple
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/android-perf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ jobs:
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
export PYTHONPATH=$(pwd)/..
Expand Down Expand Up @@ -347,7 +347,7 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
export ANDROID_ABIS="arm64-v8a"
PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
# Let's see how expensive this job is, we might want to tone it down by running it periodically
benchmark-on-device:
Expand Down
22 changes: 22 additions & 0 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,9 @@ jobs:

unittest-arm:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
Expand Down Expand Up @@ -394,6 +397,25 @@ jobs:
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
test-qnn-models-linux:
name: test-qnn-models-linux
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-qnn-sdk
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 180
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
# Placeholder for running test_qnn_delegate.py; a matrix can be used to trigger different jobs (refer to test-llama-runner-qnn-linux for an example)
# reminder: make sure each job runs fast
test-phi-3-mini-runner-linux:
name: test-phi-3-mini-runner-linux
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/trunk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ jobs:
test-arm-backend-delegation:
name: test-arm-backend-delegation
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
Expand Down Expand Up @@ -159,6 +162,9 @@ jobs:
test-arm-reference-delegation:
name: test-arm-reference-delegation
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
permissions:
id-token: write
contents: read
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
Expand Down
1 change: 1 addition & 0 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ include_patterns = [
'build/**/*.py',
'codegen/**/*.py',
# 'devtools/**/*.py',
'devtools/visualization/**/*.py',
'docs/**/*.py',
# 'examples/**/*.py',
# 'exir/**/*.py',
Expand Down
6 changes: 3 additions & 3 deletions backends/apple/coreml/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,14 @@ class Model(torch.nn.Module):
source_model = Model()
example_inputs = (torch.randn((1, 3, 256, 256)), )

pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module()

quantization_config = LinearQuantizerConfig.from_dict(
{
"global_config": {
"quantization_scheme": QuantizationScheme.symmetric,
"activation_dtype": torch.uint8,
"weight_dtype": torch.int8,
"activation_dtype": torch.quint8,
"weight_dtype": torch.qint8,
"weight_per_channel": True,
}
}
Expand Down
115 changes: 63 additions & 52 deletions backends/arm/_passes/arm_pass_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

# pyre-unsafe

import torch
from executorch.backends.arm._passes.annotate_channels_last_dim_order_pass import (
AnnotateChannelsLastDimOrder,
)
Expand Down Expand Up @@ -47,7 +46,7 @@
)
from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
ConvertMeanDimToAveragePool,
ConvertMeanDimToAveragePoolPass,
)
from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass
from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass
Expand All @@ -61,86 +60,98 @@
from executorch.backends.arm._passes.unsqueeze_scalar_placeholders_pass import (
UnsqueezeScalarPlaceholdersPass,
)
from executorch.backends.arm.tosa_specification import TosaSpecification
from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
from executorch.exir import ExportedProgram
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_manager import PassManager
from torch.fx import GraphModule


class ArmPassManager(PassManager):

def _transform(self, graph_module: torch.fx.GraphModule):
def __init__(self, tosa_spec: TosaSpecification) -> None:
self.tosa_spec = tosa_spec
super().__init__()

def _transform(self, graph_module: GraphModule):
return self(graph_module).graph_module

def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
"""Apply passes before transforming program to backend"""
def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
self.add_pass(FuseQuantizedActivationPass())
self.add_pass(RemoveGetItemPass())
self.add_pass(ConvertSplitToSlicePass())
self.add_pass(ConvertMmToBmmPass())
self.add_pass(DecomposeLinearPass())
self.add_pass(ConvertMeanDimToAveragePoolPass())

self.add_pass(AnnotateDecomposedMatmulPass())
self.add_pass(QuantizeFullArgument())
self.add_pass(FoldAndAnnotateQParamsPass())
self.add_pass(RetraceFoldedDtypesPass())
self.add_pass(InsertTableOpsPass(exported_program))

self.add_pass(RemoveClonePass())
self.add_pass(SizeAdjustConv2DPass())
self.add_pass(ConvertExpandCopyToRepeatPass())
self.add_pass(UnsqueezeBeforeRepeatPass())
self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
self.add_pass(CastInt64ToInt32Pass(exported_program))
self.add_pass(MatchArgRanksPass(exported_program))
self.add_pass(KeepDimsFalseToSqueezePass())
self.add_pass(Conv1dUnsqueezePass(exported_program))
self.add_pass(DecomposeSelectPass())

self.add_pass(AnnotateChannelsLastDimOrder())

return self._transform(exported_program.graph_module)

def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:

self.add_pass(FuseQuantizedActivationPass())
self.add_pass(RemoveGetItemPass())
self.add_pass(ConvertSplitToSlicePass())
self.add_pass(ConvertMmToBmmPass())
self.add_pass(DecomposeLinearPass())
self.add_pass(DecomposeLayerNormPass())
self.add_pass(DecomposeVarPass())
self.add_pass(ConvertMeanDimToAveragePool())
self.add_pass(DecomposeMeanDimPass())
self.add_pass(ConvertSplitToSlicePass())
self.add_pass(ConvertMmToBmmPass())
# TODO MLETORCH-558
self.add_pass(ConvertMeanDimToAveragePoolPass())
self.add_pass(DecomposeDivPass())
self.add_pass(DecomposeSoftmaxesPass())

self.add_pass(AnnotateDecomposedMatmulPass())
self.add_pass(QuantizeFullArgument())
self.add_pass(
FoldAndAnnotateQParamsPass(
[
exir_ops.edge.aten.minimum.default,
exir_ops.edge.aten.maximum.default,
exir_ops.edge.aten.add.Tensor,
exir_ops.edge.aten.avg_pool2d.default,
exir_ops.edge.aten.bmm.default,
exir_ops.edge.aten.cat.default,
exir_ops.edge.aten.convolution.default,
exir_ops.edge.aten.clone.default,
exir_ops.edge.aten.exp.default,
exir_ops.edge.aten.expand_copy.default,
exir_ops.edge.aten.full.default,
exir_ops.edge.aten.hardtanh.default,
exir_ops.edge.aten.log.default,
exir_ops.edge.aten.max_pool2d.default,
exir_ops.edge.aten.mul.Tensor,
exir_ops.edge.aten.permute_copy.default,
exir_ops.edge.aten.reciprocal.default,
exir_ops.edge.aten.relu.default,
exir_ops.edge.aten.repeat.default,
exir_ops.edge.aten.rsqrt.default,
exir_ops.edge.aten.select_copy.int,
exir_ops.edge.aten.sigmoid.default,
exir_ops.edge.aten.slice_copy.Tensor,
exir_ops.edge.aten.squeeze_copy.dims,
exir_ops.edge.aten.sub.Tensor,
exir_ops.edge.aten.sum.dim_IntList,
exir_ops.edge.aten.tanh.default,
exir_ops.edge.aten.unsqueeze_copy.default,
exir_ops.edge.aten.upsample_nearest2d.vec,
exir_ops.edge.aten.view_copy.default,
]
)
)
self.add_pass(FoldAndAnnotateQParamsPass())
self.add_pass(RetraceFoldedDtypesPass())
self.add_pass(InsertTableOpsPass(exported_program))

self.add_pass(RemoveClonePass())
self.add_pass(SizeAdjustConv2DPass())
self.add_pass(ConvertExpandCopyToRepeatPass())
self.add_pass(UnsqueezeBeforeRepeatPass())
self.add_pass(CastInt64ToInt32Pass(exported_program))
self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
self.add_pass(SizeAdjustConv2DPass())
self.add_pass(RemoveClonePass())
self.add_pass(CastInt64ToInt32Pass(exported_program))
self.add_pass(MatchArgRanksPass(exported_program))
self.add_pass(DecomposeDivPass())
self.add_pass(KeepDimsFalseToSqueezePass())
self.add_pass(Conv1dUnsqueezePass(exported_program))
self.add_pass(DecomposeSoftmaxesPass())
self.add_pass(DecomposeSelectPass())

self.add_pass(AnnotateChannelsLastDimOrder())

return self._transform(exported_program.graph_module)

def transform_for_annotation_pipeline(self, graph_module: torch.fx.GraphModule):
def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
"""Apply passes before transforming program to backend"""
if self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+BI"):
return self._tosa_080_BI_pipeline(exported_program)
elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+MI"):
return self._tosa_080_MI_pipeline(exported_program)
else:
raise NotImplementedError(
f"No pass pipeline implemented for {self.tosa_spec=}"
)

def transform_for_annotation_pipeline(self, graph_module: GraphModule):
self.add_pass(ScalarsToAttributePass())
self.add_pass(DecomposeLayerNormPass())
self.add_pass(DecomposeVarPass())
Expand Down
6 changes: 5 additions & 1 deletion backends/arm/_passes/cast_int64_pass.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Arm Limited and/or its affiliates.
# Copyright 2024-2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
Expand All @@ -17,6 +17,10 @@


class CastInt64ToInt32Pass(ExportPass):
"""
Cast int64 buffers to int32 if the int64 data is in int32 range.
"""

def __init__(self, exported_program: torch.export.ExportedProgram):
super(CastInt64ToInt32Pass, self).__init__()
self.exported_program = exported_program
Expand Down
13 changes: 6 additions & 7 deletions backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Copyright 2024 Arm Limited and/or its affiliates.
# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import copy

from typing import cast, Dict, Iterable, Set, Tuple
from typing import cast, Dict, Set, Tuple

from executorch.backends.arm.tosa_quant_utils import QuantArgs

Expand Down Expand Up @@ -55,7 +55,7 @@ def get_output_qparams(node: Node) -> dict[int, QuantArgs]:
class FoldAndAnnotateQParamsPass(ExportPass):
"""
A pass that walks the graph and removes any DQ and Q nodes before and after the target
node in the supplied list of operators.
node.
The quantization parameters from the DQ/Q nodes are stored as meta values to be
accessible for later lowering and serialization passes.
The assumption is that the quantization annotation adds DQ nodes for all tensor
Expand All @@ -82,9 +82,8 @@ class FoldAndAnnotateQParamsPass(ExportPass):
"""

def __init__(self, targeted_ops: Iterable[EdgeOpOverload]) -> None:
def __init__(self) -> None:
super().__init__()
self.targeted_ops = targeted_ops

def fold_and_annotate_arg(
self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int
Expand Down Expand Up @@ -131,7 +130,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
# Loop over the graph nodes and process every call_function node.
for n in graph_module.graph.nodes:
n = cast(Node, n)
if n.op != "call_function" or n.target not in self.targeted_ops:
if n.op != "call_function":
continue

# Make sure we haven't already set qparams meta information on the node
Expand Down Expand Up @@ -180,7 +179,7 @@ class QuantizeFullArgument(ExportPass):

def call(self, graph_module: GraphModule) -> PassResult:
modified = False
# Loop over the graph nodes and find any node in the 'targeted_ops' list.
# Loop over the graph nodes and find full.default nodes.
for n in graph_module.graph.nodes:
n = cast(Node, n)
if n.target != exir_ops.edge.aten.full.default:
Expand Down
7 changes: 4 additions & 3 deletions backends/arm/_passes/fuse_quantized_activation_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@ def _is_fuseable_quantized_activation(self, node: Node):
is_fuseable = min_val == 0

is_quantized = len(node.users) == 1 and next(iter(node.users)).target == q_op
if is_quantized:
if is_fuseable and is_quantized:
quant_node = next(iter(node.users))
zp = quant_node.args[2]
qmin = quant_node.args[3]

return is_fuseable and is_quantized and zp == qmin
return zp == qmin
else:
return False

def _is_fuseable_input(self, node: Node):
return (
Expand Down
4 changes: 2 additions & 2 deletions backends/arm/_passes/meandim_to_averagepool_pass.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Arm Limited and/or its affiliates.
# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
Expand All @@ -16,7 +16,7 @@
Argument = Any


class ConvertMeanDimToAveragePool(ExportPass):
class ConvertMeanDimToAveragePoolPass(ExportPass):
"""
Replace a mean operation with dim = [-1, -2] and keep_dim = True with an average pool operation.
"""
Expand Down
3 changes: 2 additions & 1 deletion backends/arm/_passes/remove_clone_pass.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2024 Arm Limited and/or its affiliates.
# Copyright 2024-2025 Arm Limited and/or its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
Expand All @@ -11,6 +11,7 @@


class RemoveClonePass(ExportPass):
"""Remove all clones from graph_module"""

def call_operator(self, op, args, kwargs, meta):
if op != exir_ops.edge.aten.clone.default:
Expand Down
Loading

0 comments on commit 10508a5

Please sign in to comment.