Skip to content

Commit

Permalink
Remove ExecuTorch copy of Vectorized
Browse files Browse the repository at this point in the history
Pull Request resolved: #7042

All uses are outside ExecuTorch core, so we can just use ATen Vectorized.
ghstack-source-id: 261549440
@exported-using-ghexport

Differential Revision: [D66396016](https://our.internmc.facebook.com/intern/diff/D66396016/)
  • Loading branch information
swolchok committed Jan 15, 2025
1 parent 8674880 commit 469c132
Show file tree
Hide file tree
Showing 27 changed files with 132 additions and 5,149 deletions.
6 changes: 4 additions & 2 deletions extension/llm/custom_ops/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,9 @@ target_compile_options(
install(TARGETS custom_ops DESTINATION lib)

if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
# Add a AOT library
find_package(Torch CONFIG REQUIRED)
# Use NO_CMAKE_FIND_ROOT_PATH to make sure this works even if cross-compiling.
# See note in kernels/optimized/CMakeLists.txt.
find_package(Torch CONFIG REQUIRED NO_CMAKE_FIND_ROOT_PATH)
add_library(
custom_ops_aot_lib SHARED
${_custom_ops__srcs}
Expand All @@ -83,6 +84,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
)
target_include_directories(
custom_ops_aot_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../include"
${TORCH_INCLUDE_DIRS}
)
# TODO: This only works if we install portable_lib.so to
# <site-packages>/executorch/extension/pybindings/.
Expand Down
15 changes: 4 additions & 11 deletions extension/llm/custom_ops/op_sdpa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

#include <executorch/extension/llm/custom_ops/op_sdpa.h>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/blas/CPUBlas.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
// @lint-ignore CLANGTIDY facebook-unused-include-check
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
Expand All @@ -34,18 +35,10 @@ namespace util {
constexpr size_t kKVDim = 4;

template <typename T>
inline void _store(T* dst, ::executorch::vec::Vectorized<T> src) {
inline void _store(T* dst, ::at::vec::Vectorized<T> src) {
src.store(dst);
}

/*
inline void _store(::Half* dst, at::vec::Vectorized<float> src) {
//fp16_ieee_to_fp32_value
auto res = at::vec::convert_float_half(src, src);
res.store(dst, at::vec::Vectorized<float>::size());
}
*/

template <typename T>
inline T data_index_init(T offset) {
return offset;
Expand Down Expand Up @@ -78,7 +71,7 @@ inline double calculate_scale(const Tensor& query, optional<double> scale) {
}

} // namespace util
namespace vec = ::executorch::vec;
namespace vec = ::at::vec;
using Tensor = exec_aten::Tensor;

namespace {
Expand Down
28 changes: 14 additions & 14 deletions kernels/optimized/cpu/moments_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
// for use in optimized ExecuTorch ops. Template specializations of BFloat16
// are excluded.

#include <executorch/kernels/optimized/vec/vec.h>
#include <ATen/cpu/vec/vec.h>

#include <executorch/kernels/optimized/utils/math_utils.h>
#include <executorch/runtime/platform/compiler.h>
Expand Down Expand Up @@ -47,12 +47,12 @@ void AddMoments(
template <typename T>
ET_INLINE void AddMomentsVec(
int64_t m0_add,
const executorch::vec::Vectorized<T>& m1_add,
const executorch::vec::Vectorized<T>& m2_add,
const at::vec::Vectorized<T>& m1_add,
const at::vec::Vectorized<T>& m2_add,
int64_t& m0,
executorch::vec::Vectorized<T>& m1,
executorch::vec::Vectorized<T>& m2) {
using Vec = executorch::vec::Vectorized<T>;
at::vec::Vectorized<T>& m1,
at::vec::Vectorized<T>& m2) {
using Vec = at::vec::Vectorized<T>;
const int64_t n = m0 + m0_add;
const T c =
n == 0 ? static_cast<T>(0) : static_cast<T>(m0_add) / static_cast<T>(n);
Expand All @@ -67,11 +67,11 @@ template <typename T>
inline void UpdateMomentsVec(
int64_t m0,
const T* X_ptr,
const std::array<executorch::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
const std::array<at::vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
int64_t& m0_stk0,
executorch::vec::Vectorized<acc_t<T>>& m1_stk0,
executorch::vec::Vectorized<acc_t<T>>& m2_stk0) {
using Vec = executorch::vec::Vectorized<acc_t<T>>;
at::vec::Vectorized<acc_t<T>>& m1_stk0,
at::vec::Vectorized<acc_t<T>>& m2_stk0) {
using Vec = at::vec::Vectorized<acc_t<T>>;
Vec m1_vec(0);
Vec m2_vec(0);
for (int64_t j = 0; j < m0; ++j) {
Expand All @@ -92,13 +92,13 @@ std::pair<acc_t<T>, acc_t<T>>
RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
using T_ACC = acc_t<T>;

constexpr int64_t kVecSize = executorch::vec::Vectorized<T>::size();
constexpr int64_t kAccVecSize = executorch::vec::Vectorized<T_ACC>::size();
constexpr int64_t kVecSize = at::vec::Vectorized<T>::size();
constexpr int64_t kAccVecSize = at::vec::Vectorized<T_ACC>::size();
const int64_t n = N / kVecSize;
const int64_t m = executorch::utils::divup(n, kChunkSize);
const int64_t depth = executorch::utils::CeilLog2(m);

using Vec = executorch::vec::Vectorized<T_ACC>;
using Vec = at::vec::Vectorized<T_ACC>;
const Vec kZeroVec(T_ACC(0));
std::array<int64_t, kMaxDepth> m0_stk;
std::array<Vec, kMaxDepth> m1_stk;
Expand Down Expand Up @@ -168,7 +168,7 @@ RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
template <typename T>
std::pair<acc_t<T>, acc_t<T>>
RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
using Vec = executorch::vec::Vectorized<T>;
using Vec = at::vec::Vectorized<T>;
constexpr int64_t kVecSize = Vec::size();
const int64_t n = N / kVecSize;
const int64_t m = executorch::utils::divup(n, kChunkSize);
Expand Down
17 changes: 9 additions & 8 deletions kernels/optimized/cpu/op_add.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
Expand Down Expand Up @@ -99,8 +100,8 @@ Tensor& opt_add_out(
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[alpha_val, b_casted](Vec x) {
return x + Vec(alpha_val * b_casted);
},
Expand Down Expand Up @@ -131,8 +132,8 @@ Tensor& opt_add_out(
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -166,7 +167,7 @@ Tensor& opt_add_out(
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -244,8 +245,8 @@ Tensor& opt_add_scalar_out(
CTYPE alpha_val;
ET_EXTRACT_SCALAR(alpha, alpha_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[alpha_val, b_casted](Vec x) {
return x + Vec(alpha_val * b_casted);
},
Expand Down
19 changes: 10 additions & 9 deletions kernels/optimized/cpu/op_div.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
Expand Down Expand Up @@ -76,16 +77,16 @@ Tensor& opt_div_out(
CTYPE_SCALAR scalar_val = *scalar->const_data_ptr<CTYPE_SCALAR>();
CTYPE scalar_casted = static_cast<CTYPE>(scalar_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
if (a.numel() == 1) {
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[scalar_casted](Vec x) { return Vec(scalar_casted) / x; },
out.mutable_data_ptr<CTYPE>(),
tensor->const_data_ptr<CTYPE>(),
out.numel());
} else {
Vec inv_scalar_casted_vec(CTYPE(1) / scalar_casted);
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[inv_scalar_casted_vec](Vec x) {
return x * inv_scalar_casted_vec;
},
Expand All @@ -111,8 +112,8 @@ Tensor& opt_div_out(
"Failed to resize output tensor.");

ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "div.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x / y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -142,7 +143,7 @@ Tensor& opt_div_out(
out,
"Failed to resize output tensor.");
ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
if (selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
Expand Down Expand Up @@ -222,9 +223,9 @@ Tensor& opt_div_scalar_out(
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
Vec inv_b_casted_vec(CTYPE(1) / b_casted);
executorch::vec::map<CTYPE>(
at::vec::map<CTYPE>(
[inv_b_casted_vec](Vec x) { return x * inv_b_casted_vec; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down
7 changes: 4 additions & 3 deletions kernels/optimized/cpu/op_exp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

#include <cmath>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
Expand All @@ -34,8 +35,8 @@ void exp_data(
const CTYPE_IN* in_data,
const size_t numel,
CTYPE_OUT* out_data) {
using Vec = executorch::vec::Vectorized<CTYPE_IN>;
executorch::vec::map<CTYPE_IN>(
using Vec = at::vec::Vectorized<CTYPE_IN>;
at::vec::map<CTYPE_IN>(
[](Vec x) { return x.exp(); }, out_data, in_data, numel);
}

Expand Down
11 changes: 6 additions & 5 deletions kernels/optimized/cpu/op_le.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>
Expand Down Expand Up @@ -44,8 +45,8 @@ Tensor& opt_le_tensor_out(
if (a_type == b_type && a_type == out_type) {
ET_SWITCH_REAL_TYPES_AND(
Bool, out_type, ctx, "le.Tensor_out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x.le(y); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -109,8 +110,8 @@ Tensor& opt_le_scalar_out(
CTYPE_B b_val = 0;
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x.le(Vec(b_casted)); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down
19 changes: 10 additions & 9 deletions kernels/optimized/cpu/op_mul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
Expand Down Expand Up @@ -95,7 +96,7 @@ Tensor& handle_last_dim_broadcast(
const size_t outer_size = getLeadingDims(out, out.dim() - 1);
const auto broadcast_size = out.size(out.dim() - 1);
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_broadcast_last_dim<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -164,7 +165,7 @@ Tensor& handle_broadcast_mul(
inner_size = lhs->sizes()[lhs->dim() - 1];
}
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -203,8 +204,8 @@ Tensor& opt_mul_out(
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x * Vec(b_casted); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand All @@ -229,8 +230,8 @@ Tensor& opt_mul_out(
"Failed to resize output tensor.");

ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map2<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map2<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down Expand Up @@ -306,8 +307,8 @@ Tensor& opt_mul_scalar_out(
ET_EXTRACT_SCALAR(b, b_val);
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
using Vec = at::vec::Vectorized<CTYPE>;
at::vec::map<CTYPE>(
[b_casted](Vec x) { return x * Vec(b_casted); },
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
Expand Down
7 changes: 4 additions & 3 deletions kernels/optimized/cpu/op_native_layer_norm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
#include <cmath>
#include <tuple>

#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <executorch/kernels/optimized/cpu/moments_utils.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/util/normalization_ops_util.h>

namespace torch {
Expand All @@ -33,7 +34,7 @@ void layer_norm(
Tensor& out,
Tensor& mean,
Tensor& rstd) {
using Vec = executorch::vec::Vectorized<CTYPE>;
using Vec = at::vec::Vectorized<CTYPE>;

const size_t dim = input.dim() - normalized_shape.size();
const size_t dim_size = input.size(dim);
Expand Down Expand Up @@ -93,7 +94,7 @@ void layer_norm(
dst_ptr[j] = (src_ptr[j] * scale + offset) * gamma_v + beta_v;
}
} else {
executorch::vec::map3<CTYPE>(
at::vec::map3<CTYPE>(
[scale, offset](Vec x, Vec gamma, Vec beta) {
return (x * Vec(scale) + Vec(offset)) * gamma + beta;
},
Expand Down
Loading

0 comments on commit 469c132

Please sign in to comment.