From 02bd5a7013c558f1e5220fc89bafa68f40276549 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas
Date: Sun, 7 Apr 2024 21:06:47 +0100
Subject: [PATCH 01/10] [FMV][GlobalOpt] Bypass the IFunc Resolver of MultiVersioned functions.

To deduce whether the optimization is legal we need to compare the target
features between caller and callee versions. The criteria for bypassing
the resolver are the following:

 * If the callee's feature set is a subset of the caller's feature set,
   then the callee is a candidate for direct call.

 * Among such candidates the one of highest priority is the best match
   and it shall be picked, unless there is a version of the callee with
   higher priority than the best match which cannot be picked from a
   higher priority caller (directly or through the resolver).

 * For every higher priority callee version than the best match, there
   is a higher priority caller version whose feature set availability
   is implied by the callee's feature set.

Example:

Callers and Callees are ordered in decreasing priority.
The arrows indicate successful call redirections.

  Caller        Callee      Explanation
=========================================================================
mops+sve2 --+--> mops       all the callee versions are subsets of the
            |               caller but mops has the highest priority
            |
     mops --+    sve2       between mops and default callees, mops wins

      sve        sve        between sve and default callees, sve wins
                            but sve2 does not have a high priority caller

  default -----> default    sve (callee) implies sve (caller),
                            sve2(callee) implies sve (caller),
                            mops(callee) implies mops(caller)
---
 .../llvm/Analysis/TargetTransformInfo.h | 14 +
 .../llvm/Analysis/TargetTransformInfoImpl.h | 4 +
 .../llvm/TargetParser/AArch64TargetParser.h | 4 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp | 6 +
 .../AArch64/AArch64TargetTransformInfo.cpp | 8 +
 .../AArch64/AArch64TargetTransformInfo.h | 4 +
 llvm/lib/TargetParser/AArch64TargetParser.cpp | 17 +-
 llvm/lib/Transforms/IPO/GlobalOpt.cpp | 141 +++++-
 .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 412 ++++++++++++++++++
 9 files changed, 604 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index fa9392b86c15b9..49adecbc81e2bb 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1762,6 +1762,12 @@ class TargetTransformInfo { /// false, but it shouldn't matter what it returns anyway. bool hasArmWideBranch(bool Thumb) const; + /// Returns true if the target supports Function MultiVersioning. + bool hasFMV() const; + + /// Returns a bitmask constructed from the target features of a function. + uint64_t getFeatureMask(Function &F) const; + /// \return The maximum number of function arguments the target supports. 
unsigned getMaxNumArgs() const; @@ -2152,6 +2158,8 @@ class TargetTransformInfo::Concept { virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; + virtual bool hasFMV() const = 0; + virtual uint64_t getFeatureMask(Function &F) const = 0; virtual unsigned getMaxNumArgs() const = 0; }; @@ -2904,6 +2912,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasArmWideBranch(Thumb); } + bool hasFMV() const override { return Impl.hasFMV(); } + + uint64_t getFeatureMask(Function &F) const override { + return Impl.getFeatureMask(F); + } + unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 63c2ef8912b29c..6b8cae928ff6e9 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -941,6 +941,10 @@ class TargetTransformInfoImplBase { bool hasArmWideBranch(bool) const { return false; } + bool hasFMV() const { return false; } + + uint64_t getFeatureMask(Function &F) const { return 0; } + unsigned getMaxNumArgs() const { return UINT_MAX; } protected: diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 805b963a7a13c7..152cfee8cf373d 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -846,6 +846,7 @@ const ArchInfo *getArchForCpu(StringRef CPU); // Parser const ArchInfo *parseArch(StringRef Arch); std::optional parseArchExtension(StringRef Extension); +std::optional parseTargetFeature(StringRef Feature); // Given the name of a CPU or alias, return the correponding CpuInfo. 
std::optional parseCpu(StringRef Name); // Used by target parser tests @@ -856,7 +857,8 @@ bool isX18ReservedByDefault(const Triple &TT); // For given feature names, return a bitmask corresponding to the entries of // AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks // themselves, they are sequential (0, 1, 2, 3, ...). -uint64_t getCpuSupportsMask(ArrayRef FeatureStrs); +uint64_t getCpuSupportsMask(ArrayRef FeatureStrs, + bool IsBackEndFeature = false); void PrintSupportedExtensions(StringMap DescMap); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 5f933b4587843c..3caca8a417d3ee 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1296,6 +1296,12 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const { return TTIImpl->hasArmWideBranch(Thumb); } +bool TargetTransformInfo::hasFMV() const { return TTIImpl->hasFMV(); } + +uint64_t TargetTransformInfo::getFeatureMask(Function &F) const { + return TTIImpl->getFeatureMask(F); +} + unsigned TargetTransformInfo::getMaxNumArgs() const { return TTIImpl->getMaxNumArgs(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index ee7137b92445bb..e68565ed16f06f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include @@ -231,6 +232,13 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } +uint64_t AArch64TTIImpl::getFeatureMask(Function &F) const { + StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString(); + 
SmallVector Features; + FeatureStr.split(Features, ","); + return AArch64::getCpuSupportsMask(Features, /*IsBackEndFeature = */ true); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index de39dea2be43e1..fe275341930ba5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -83,6 +83,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; + bool hasFMV() const { return ST->hasFMV(); } + + uint64_t getFeatureMask(Function &F) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 71099462d5ecff..5eecde791a0336 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -47,12 +47,13 @@ std::optional AArch64::ArchInfo::findBySubArch(StringRef SubA return {}; } -uint64_t AArch64::getCpuSupportsMask(ArrayRef FeatureStrs) { +uint64_t AArch64::getCpuSupportsMask(ArrayRef FeatureStrs, + bool IsBackEndFeature) { uint64_t FeaturesMask = 0; - for (const StringRef &FeatureStr : FeatureStrs) { - if (auto Ext = parseArchExtension(FeatureStr)) + for (const StringRef FeatureStr : FeatureStrs) + if (auto Ext = IsBackEndFeature ? 
parseTargetFeature(FeatureStr) + : parseArchExtension(FeatureStr)) FeaturesMask |= (1ULL << Ext->CPUFeature); - } return FeaturesMask; } @@ -132,6 +133,14 @@ std::optional AArch64::parseArchExtension(StringRef Arch return {}; } +std::optional +AArch64::parseTargetFeature(StringRef Feature) { + for (const auto &E : Extensions) + if (Feature == E.Feature) + return E; + return {}; +} + std::optional AArch64::parseCpu(StringRef Name) { // Resolve aliases first. Name = resolveCPUAlias(Name); diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index da714c9a75701b..159362058ef42c 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -89,7 +89,7 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated"); STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed"); STATISTIC(NumInternalFunc, "Number of internal functions"); STATISTIC(NumColdCC, "Number of functions marked coldcc"); -STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs"); +STATISTIC(NumIFuncsResolved, "Number of resolved IFuncs"); STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed"); static cl::opt @@ -2462,6 +2462,142 @@ DeleteDeadIFuncs(Module &M, return Changed; } +// Follows the use-def chain of \p V backwards until it finds a Function, +// in which case it collects in \p Versions. +static void collectVersions(Value *V, SmallVectorImpl &Versions) { + if (auto *F = dyn_cast(V)) { + Versions.push_back(F); + } else if (auto *Sel = dyn_cast(V)) { + collectVersions(Sel->getTrueValue(), Versions); + collectVersions(Sel->getFalseValue(), Versions); + } else if (auto *Phi = dyn_cast(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + collectVersions(Phi->getIncomingValue(I), Versions); + } +} + +// Bypass the IFunc Resolver of MultiVersioned functions when possible. 
To +// deduce whether the optimization is legal we need to compare the target +// features between caller and callee versions. The criteria for bypassing +// the resolver are the following: +// +// * If the callee's feature set is a subset of the caller's feature set, +// then the callee is a candidate for direct call. +// +// * Among such candidates the one of highest priority is the best match +// and it shall be picked, unless there is a version of the callee with +// higher priority than the best match which cannot be picked from a +// higher priority caller (directly or through the resolver). +// +// * For every higher priority callee version than the best match, there +// is a higher priority caller version whose feature set availability +// is implied by the callee's feature set. +// +static bool OptimizeNonTrivialIFuncs( + Module &M, function_ref GetTTI) { + bool Changed = false; + + // Cache containing the mask constructed from a function's target features. + DenseMap FeatureMask; + + for (GlobalIFunc &IF : M.ifuncs()) { + if (IF.isInterposable()) + continue; + + Function *Resolver = IF.getResolverFunction(); + if (!Resolver) + continue; + + if (Resolver->isInterposable()) + continue; + + TargetTransformInfo &TTI = GetTTI(*Resolver); + if (!TTI.hasFMV()) + return false; + + // Discover the callee versions. + SmallVector Callees; + for (BasicBlock &BB : *Resolver) + if (auto *Ret = dyn_cast_or_null(BB.getTerminator())) + collectVersions(Ret->getReturnValue(), Callees); + + if (Callees.empty()) + continue; + + // Cache the feature mask for each callee. + for (Function *Callee : Callees) { + auto [It, Inserted] = FeatureMask.try_emplace(Callee); + if (Inserted) + It->second = TTI.getFeatureMask(*Callee); + } + + // Sort the callee versions in decreasing priority order. + sort(Callees, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + // Find the callsites and cache the feature mask for each caller. 
+ SmallVector Callers; + DenseMap> CallSites; + for (User *U : IF.users()) { + if (auto *CB = dyn_cast(U)) { + if (CB->getCalledOperand() == &IF) { + Function *Caller = CB->getFunction(); + auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); + if (FeatInserted) + FeatIt->second = TTI.getFeatureMask(*Caller); + auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); + if (CallInserted) + Callers.push_back(Caller); + CallIt->second.push_back(CB); + } + } + } + + // Sort the caller versions in decreasing priority order. + sort(Callers, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; }; + + // Index to the highest priority candidate. + unsigned I = 0; + // Now try to redirect calls starting from higher priority callers. + for (Function *Caller : Callers) { + // Getting here means we found callers of equal priority. + if (I == Callees.size()) + break; + Function *Callee = Callees[I]; + uint64_t CallerBits = FeatureMask[Caller]; + uint64_t CalleeBits = FeatureMask[Callee]; + // If the feature set of the caller implies the feature set of the + // highest priority candidate then it shall be picked. In case of + // identical sets advance the candidate index one position. + if (CallerBits == CalleeBits) + ++I; + else if (!implies(CallerBits, CalleeBits)) { + // Keep advancing the candidate index as long as the caller's + // features are a subset of the current candidate's. 
+ while (implies(CalleeBits, CallerBits)) { + if (++I == Callees.size()) + break; + CalleeBits = FeatureMask[Callees[I]]; + } + continue; + } + auto &Calls = CallSites[Caller]; + for (CallBase *CS : Calls) + CS->setCalledOperand(Callee); + Changed = true; + } + if (IF.use_empty() || + all_of(IF.users(), [](User *U) { return isa(U); })) + NumIFuncsResolved++; + } + return Changed; +} + static bool optimizeGlobalsInModule(Module &M, const DataLayout &DL, function_ref GetTLI, @@ -2525,6 +2661,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL, // Optimize IFuncs whose callee's are statically known. LocalChange |= OptimizeStaticIFuncs(M); + // Optimize IFuncs based on the target features of the caller. + LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI); + // Remove any IFuncs that are now dead. LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats); diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll new file mode 100644 index 00000000000000..2805ce6fb2a3dc --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -0,0 +1,412 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_same_priority_callers)" --version 4 +; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +$test_single_bb_resolver.resolver = comdat any +$test_multi_bb_resolver.resolver = comdat any +$test_caller_feats_not_implied.resolver = comdat any +$test_same_priority_callers.resolver = comdat any +$foo.resolver = comdat any +$bar.resolver = comdat any +$goo.resolver = comdat any +$baz.resolver = comdat any + +@__aarch64_cpu_features = external local_unnamed_addr global { i64 } + +@test_single_bb_resolver.ifunc = weak_odr alias i32 (), ptr 
@test_single_bb_resolver +@test_multi_bb_resolver.ifunc = weak_odr alias i32 (), ptr @test_multi_bb_resolver +@test_caller_feats_not_implied.ifunc = weak_odr alias i32 (), ptr @test_caller_feats_not_implied +@test_same_priority_callers.ifunc = weak_odr alias i32 (), ptr @test_same_priority_callers +@foo.ifunc = weak_odr alias i32 (), ptr @foo +@bar.ifunc = weak_odr alias i32 (), ptr @bar +@goo.ifunc = weak_odr alias i32 (), ptr @goo +@baz.ifunc = weak_odr alias i32 (), ptr @baz + +@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver +@test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver +@test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver +@test_same_priority_callers = weak_odr ifunc i32 (), ptr @test_same_priority_callers.resolver +@foo = weak_odr ifunc i32 (), ptr @foo.resolver +@bar = weak_odr ifunc i32 (), ptr @bar.resolver +@goo = weak_odr ifunc i32 (), ptr @goo.resolver +@baz = weak_odr ifunc i32 (), ptr @baz.resolver + +declare void @__init_cpu_features_resolver() local_unnamed_addr + +declare i32 @test_single_bb_resolver._Msve() #2 + +declare i32 @test_single_bb_resolver._Msve2() #3 + +define i32 @test_single_bb_resolver.default() #1 { +; CHECK-LABEL: define i32 @test_single_bb_resolver.default( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_single_bb_resolver.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve + %common.ret.op = 
select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + ret ptr %common.ret.op +} + +define i32 @foo._Msve() #2 { +; CHECK-LABEL: define i32 @foo._Msve( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + %add = add nsw i32 %call, 30 + ret i32 %add +} + +define i32 @foo._Msve2() #3 { +; CHECK-LABEL: define i32 @foo._Msve2( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK: [[CALL1:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; CHECK: [[CALL2:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; +entry: + %call1 = tail call i32 @test_single_bb_resolver() + %call2 = tail call i32 @test_single_bb_resolver() + %added = add nsw i32 %call1, %call2 + %add = add nsw i32 %added, 20 + ret i32 %add +} + +define i32 @foo.default() #1 { +; CHECK-LABEL: define i32 @foo.default( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + %add = add nsw i32 %call, 10 + ret i32 %add +} + +define weak_odr ptr @foo.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @foo.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %foo._Msve.foo.default = select i1 %.not3, ptr @foo.default, ptr @foo._Msve + %common.ret.op = select i1 %.not, ptr %foo._Msve.foo.default, ptr @foo._Msve2 + ret ptr %common.ret.op +} + +define i32 @test_multi_bb_resolver._Mmops() #4 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver._Mmops( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +entry: + ret i32 3 +} + +define i32 @test_multi_bb_resolver._Msve2() #3 { +; 
CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve2( +; CHECK-SAME: ) #[[ATTR1]] { +entry: + ret i32 2 +} + +define i32 @test_multi_bb_resolver._Msve() #2 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +entry: + ret i32 1 +} + +define i32 @test_multi_bb_resolver.default() #1 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver.default( +; CHECK-SAME: ) #[[ATTR2]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_multi_bb_resolver.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 68719476736 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + br label %common.ret +} + +define i32 @bar._MmopsMsve2() #5 { +; CHECK-LABEL: define i32 @bar._MmopsMsve2( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 40 + ret i32 %add +} + +define i32 @bar._Mmops() #4 { +; CHECK-LABEL: define i32 @bar._Mmops( +; 
CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 30 + ret i32 %add +} + +define i32 @bar._Msve() #2 { +; CHECK-LABEL: define i32 @bar._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 20 + ret i32 %add +} + +define i32 @bar.default() #1 { +; CHECK-LABEL: define i32 @bar.default( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 10 + ret i32 %add +} + +define weak_odr ptr @bar.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @bar.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460821022900224 + %2 = icmp eq i64 %1, 576460821022900224 + %3 = and i64 %0, 1073741824 + %.not = icmp eq i64 %3, 0 + %bar._Msve.bar.default = select i1 %.not, ptr @bar.default, ptr @bar._Msve + %common.ret.op = select i1 %2, ptr @bar._MmopsMsve2, ptr %bar._Msve.bar.default + ret ptr %common.ret.op +} + +define i32 @test_caller_feats_not_implied._Mmops() #4 { +; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Mmops( +; CHECK-SAME: ) #[[ATTR4]] { +entry: + ret i32 3 +} + +define i32 @test_caller_feats_not_implied._Msme() #6 { +; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Msme( +; CHECK-SAME: ) #[[ATTR6:[0-9]+]] { +entry: + ret i32 2 +} + +define i32 @test_caller_feats_not_implied._Msve() #2 { +; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +entry: + ret i32 1 +} + +define i32 @test_caller_feats_not_implied.default() #1 { +; 
CHECK-LABEL: define i32 @test_caller_feats_not_implied.default( +; CHECK-SAME: ) #[[ATTR2]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_caller_feats_not_implied.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 4398046511104 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve + br label %common.ret +} + +define i32 @goo._Mmops() #4 { +; CHECK-LABEL: define i32 @goo._Mmops( +; CHECK-SAME: ) #[[ATTR4]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @goo._Msve() #2 { +; CHECK-LABEL: define i32 @goo._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @goo.default() #1 { +; CHECK-LABEL: define i32 @goo.default( +; CHECK-SAME: ) 
#[[ATTR2]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define weak_odr ptr @goo.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @goo.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %goo._Msve.goo.default = select i1 %.not3, ptr @goo.default, ptr @goo._Msve + %common.ret.op = select i1 %.not, ptr %goo._Msve.goo.default, ptr @goo._Mmops + ret ptr %common.ret.op +} + +define i32 @test_same_priority_callers._Msve() #2 { +; CHECK-LABEL: define i32 @test_same_priority_callers._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +entry: + ret i32 1 +} + +define i32 @test_same_priority_callers.default() #1 { +; CHECK-LABEL: define i32 @test_same_priority_callers.default( +; CHECK-SAME: ) #[[ATTR2]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_same_priority_callers.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_same_priority_callers.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 1073741824 + %.not = icmp eq i64 %1, 0 + %test_same_priority_callers._Msve.test_same_priority_callers.default = select i1 %.not, ptr @test_same_priority_callers.default, ptr @test_same_priority_callers._Msve + ret ptr %test_same_priority_callers._Msve.test_same_priority_callers.default +} + +define dso_local i32 @baz._Msve() #2 { +; CHECK-LABEL: define dso_local i32 @baz._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_same_priority_callers._Msve() +; +entry: + %call = tail call i32 @test_same_priority_callers() + ret i32 %call +} + 
+define i32 @baz._Maes() #1 { +; CHECK-LABEL: define i32 @baz._Maes( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_same_priority_callers() +; +entry: + %call = tail call i32 @test_same_priority_callers() + ret i32 %call +} + +; This isn't a bug in globalopt, but rather a problematic input. +; The 'aes' extension does not add any target features on top +; of what is inherited from the command line. +; +; What happens is that since baz._Maes and baz.default have the same priority, +; globalopt tries to optimize the call in baz.default first and succeeds leaving +; the remaining call in baz._Maes pointing to the resolver. +; +define dso_local i32 @baz.default() #1 { +; CHECK-LABEL: define dso_local i32 @baz.default( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_same_priority_callers.default() +; +entry: + %call = tail call i32 @test_same_priority_callers() + ret i32 %call +} + +define weak_odr ptr @baz.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @baz.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 1073741824 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 16384 + %.not3 = icmp eq i64 %2, 0 + %baz._Maes.baz.default = select i1 %.not3, ptr @baz.default, ptr @baz._Maes + %common.ret.op = select i1 %.not, ptr %baz._Maes.baz.default, ptr @baz._Msve + ret ptr %common.ret.op +} + +attributes #0 = { "target-features"="+fmv" } +attributes #1 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" } +attributes #3 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" } +attributes #4 = { "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a" } +attributes #5 = { 
"target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outline-atomics,+sve,+sve2,+v8a" } +attributes #6 = { "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+sme,+v8a" } From 16aa3baf9d0c354611b9d270a85c9d458302a92a Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Wed, 13 Nov 2024 19:48:27 +0000 Subject: [PATCH 02/10] Changes from last revision: * clang format * remove leftover target hook hasFMV after rebase * remove filter in regression test after rebase --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2 -- llvm/lib/TargetParser/AArch64TargetParser.cpp | 2 +- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 2 +- llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index c4cfd5bfe82cf1..5d6663a4a0c146 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -88,8 +88,6 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; - bool hasFMV() const { return ST->hasFMV(); } - uint64_t getFeatureMask(Function &F) const; /// \name Scalar TTI Implementations diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 0b1a7bacdaa5ab..588ea9a5dba42a 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -52,7 +52,7 @@ std::optional lookupFMVByID(llvm::AArch64::ArchExtKind ExtID) { for (const auto &I : llvm::AArch64::getFMVInfo()) if (I.ID && *I.ID == ExtID) - return I; + return I; return {}; } diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 3c2aba774e69c2..a427fbc2f7ea9b 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ 
b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2753,7 +2753,7 @@ static bool OptimizeNonTrivialIFuncs( unsigned I = 0; // Now try to redirect calls starting from higher priority callers. for (Function *Caller : Callers) { - assert (I < Callees.size() && "Found callers of equal priority"); + assert(I < Callees.size() && "Found callers of equal priority"); Function *Callee = Callees[I]; uint64_t CallerBits = FeatureMask[Caller]; diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index 91e991a778fa11..8e0072c3416b5f 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_same_priority_callers)" --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied)" --version 4 ; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" From 052cef872c87a18794a7f243e995a63e4b012b78 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Thu, 28 Nov 2024 23:06:28 +0000 Subject: [PATCH 03/10] Changes from last revision: Use FMV priority mask when sorting candidates --- .../TargetParser/AArch64FeatPriorities.inc | 66 +++++++++++ .../llvm/TargetParser/AArch64TargetParser.h | 15 ++- llvm/lib/Target/AArch64/AArch64FMV.td | 105 +++++++++--------- .../AArch64/AArch64TargetTransformInfo.cpp | 2 +- llvm/lib/TargetParser/AArch64TargetParser.cpp | 33 ++++-- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 90 +++------------ llvm/utils/TableGen/ARMTargetDefEmitter.cpp | 4 +- 7 files changed, 169 insertions(+), 146 deletions(-) create mode 100644 
llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc diff --git a/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc b/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc new file mode 100644 index 00000000000000..96af618032aea3 --- /dev/null +++ b/llvm/include/llvm/TargetParser/AArch64FeatPriorities.inc @@ -0,0 +1,66 @@ +//===- AArch64FeatPriorities.inc - AArch64 FMV Priorities enum --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file enumerates the AArch64 FMV features sorted in ascending priority. +// +//===----------------------------------------------------------------------===// + +#ifndef AARCH64_FEAT_PRIORITIES_INC_H +#define AARCH64_FEAT_PRIORITIES_INC_H + +// Function Multi Versioning feature priorities. 
+enum FeatPriorities { + PRIOR_RNG, + PRIOR_FLAGM, + PRIOR_FLAGM2, + PRIOR_LSE, + PRIOR_FP, + PRIOR_SIMD, + PRIOR_DOTPROD, + PRIOR_SM4, + PRIOR_RDM, + PRIOR_CRC, + PRIOR_SHA2, + PRIOR_SHA3, + PRIOR_PMULL, + PRIOR_FP16, + PRIOR_FP16FML, + PRIOR_DIT, + PRIOR_DPB, + PRIOR_DPB2, + PRIOR_JSCVT, + PRIOR_FCMA, + PRIOR_RCPC, + PRIOR_RCPC2, + PRIOR_RCPC3, + PRIOR_FRINTTS, + PRIOR_I8MM, + PRIOR_BF16, + PRIOR_SVE, + PRIOR_SVE_F32MM, + PRIOR_SVE_F64MM, + PRIOR_SVE2, + PRIOR_SVE_PMULL128, + PRIOR_SVE_BITPERM, + PRIOR_SVE_SHA3, + PRIOR_SVE_SM4, + PRIOR_SME, + PRIOR_MEMTAG2, + PRIOR_SB, + PRIOR_PREDRES, + PRIOR_SSBS2, + PRIOR_BTI, + PRIOR_LS64_ACCDATA, + PRIOR_WFXT, + PRIOR_SME_F64, + PRIOR_SME_I64, + PRIOR_SME2, + PRIOR_MOPS +}; + +#endif diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 1311329821828f..bd9354b4e7fa1f 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -35,6 +35,7 @@ struct ArchInfo; struct CpuInfo; #include "llvm/TargetParser/AArch64CPUFeatures.inc" +#include "llvm/TargetParser/AArch64FeatPriorities.inc" static_assert(FEAT_MAX < 62, "Number of features in CPUFeatures are limited to 62 entries"); @@ -69,12 +70,12 @@ struct ExtensionInfo { struct FMVInfo { StringRef Name; // The target_version/target_clones spelling. - CPUFeatures Bit; // Index of the bit in the FMV feature bitset. + CPUFeatures FeatureBit; // Index of the bit in the FMV feature bitset. std::optional ID; // The architecture extension to enable. - unsigned Priority; // FMV priority. - FMVInfo(StringRef Name, CPUFeatures Bit, std::optional ID, - unsigned Priority) - : Name(Name), Bit(Bit), ID(ID), Priority(Priority) {}; + FeatPriorities PriorityBit; // FMV priority. 
+ FMVInfo(StringRef Name, CPUFeatures FeatureBit, std::optional ID, + FeatPriorities PriorityBit) + : Name(Name), FeatureBit(FeatureBit), ID(ID), PriorityBit(PriorityBit){}; }; const std::vector &getFMVInfo(); @@ -271,6 +272,10 @@ bool isX18ReservedByDefault(const Triple &TT); // Return the priority for a given set of FMV features. unsigned getFMVPriority(ArrayRef Features); +// For given feature names, return a bitmask corresponding to the entries of +// AArch64::FeatPriorities. +uint64_t getPriorityMask(ArrayRef Features); + // For given feature names, return a bitmask corresponding to the entries of // AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks // themselves, they are sequential (0, 1, 2, 3, ...). diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td index fc7a94a5fe475f..e0f56fd5556196 100644 --- a/llvm/lib/Target/AArch64/AArch64FMV.td +++ b/llvm/lib/Target/AArch64/AArch64FMV.td @@ -22,64 +22,65 @@ // Something you can add to target_version or target_clones. -class FMVExtension { +class FMVExtension { // Name, as spelled in target_version or target_clones. e.g. "memtag". - string Name = n; + string Name = name; // A C++ expression giving the number of the bit in the FMV ABI. // Currently this is given as a value from the enum "CPUFeatures". - string Bit = b; + string FeatureBit = "FEAT_" # enumeration; // SubtargetFeature enabled for codegen when this FMV feature is present. - string BackendFeature = n; + string BackendFeature = name; - // The FMV priority. - int Priority = p; + // A C++ expression giving the number of the priority bit. + // Currently this is given as a value from the enum "FeatPriorities". 
+ string PriorityBit = "PRIOR_" # enumeration; } -def : FMVExtension<"aes", "FEAT_PMULL", 150>; -def : FMVExtension<"bf16", "FEAT_BF16", 280>; -def : FMVExtension<"bti", "FEAT_BTI", 510>; -def : FMVExtension<"crc", "FEAT_CRC", 110>; -def : FMVExtension<"dit", "FEAT_DIT", 180>; -def : FMVExtension<"dotprod", "FEAT_DOTPROD", 104>; -let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "FEAT_DPB", 190>; -let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "FEAT_DPB2", 200>; -def : FMVExtension<"f32mm", "FEAT_SVE_F32MM", 350>; -def : FMVExtension<"f64mm", "FEAT_SVE_F64MM", 360>; -def : FMVExtension<"fcma", "FEAT_FCMA", 220>; -def : FMVExtension<"flagm", "FEAT_FLAGM", 20>; -let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FEAT_FLAGM2", 30>; -def : FMVExtension<"fp", "FEAT_FP", 90>; -def : FMVExtension<"fp16", "FEAT_FP16", 170>; -def : FMVExtension<"fp16fml", "FEAT_FP16FML", 175>; -let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FEAT_FRINTTS", 250>; -def : FMVExtension<"i8mm", "FEAT_I8MM", 270>; -def : FMVExtension<"jscvt", "FEAT_JSCVT", 210>; -def : FMVExtension<"ls64", "FEAT_LS64_ACCDATA", 520>; -def : FMVExtension<"lse", "FEAT_LSE", 80>; -def : FMVExtension<"memtag", "FEAT_MEMTAG2", 440>; -def : FMVExtension<"mops", "FEAT_MOPS", 650>; -def : FMVExtension<"predres", "FEAT_PREDRES", 480>; -def : FMVExtension<"rcpc", "FEAT_RCPC", 230>; -let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "FEAT_RCPC2", 240>; -def : FMVExtension<"rcpc3", "FEAT_RCPC3", 241>; -def : FMVExtension<"rdm", "FEAT_RDM", 108>; -def : FMVExtension<"rng", "FEAT_RNG", 10>; -def : FMVExtension<"sb", "FEAT_SB", 470>; -def : FMVExtension<"sha2", "FEAT_SHA2", 130>; -def : FMVExtension<"sha3", "FEAT_SHA3", 140>; -def : FMVExtension<"simd", "FEAT_SIMD", 100>; -def : FMVExtension<"sm4", "FEAT_SM4", 106>; -def : FMVExtension<"sme", "FEAT_SME", 430>; -def : FMVExtension<"sme-f64f64", "FEAT_SME_F64", 560>; -def : FMVExtension<"sme-i16i64", "FEAT_SME_I64", 
570>; -def : FMVExtension<"sme2", "FEAT_SME2", 580>; -def : FMVExtension<"ssbs", "FEAT_SSBS2", 490>; -def : FMVExtension<"sve", "FEAT_SVE", 310>; -def : FMVExtension<"sve2", "FEAT_SVE2", 370>; -def : FMVExtension<"sve2-aes", "FEAT_SVE_PMULL128", 380>; -def : FMVExtension<"sve2-bitperm", "FEAT_SVE_BITPERM", 400>; -def : FMVExtension<"sve2-sha3", "FEAT_SVE_SHA3", 410>; -def : FMVExtension<"sve2-sm4", "FEAT_SVE_SM4", 420>; -def : FMVExtension<"wfxt", "FEAT_WFXT", 550>; +def : FMVExtension<"aes", "PMULL">; +def : FMVExtension<"bf16", "BF16">; +def : FMVExtension<"bti", "BTI">; +def : FMVExtension<"crc", "CRC">; +def : FMVExtension<"dit", "DIT">; +def : FMVExtension<"dotprod", "DOTPROD">; +let BackendFeature = "ccpp" in def : FMVExtension<"dpb", "DPB">; +let BackendFeature = "ccdp" in def : FMVExtension<"dpb2", "DPB2">; +def : FMVExtension<"f32mm", "SVE_F32MM">; +def : FMVExtension<"f64mm", "SVE_F64MM">; +def : FMVExtension<"fcma", "FCMA">; +def : FMVExtension<"flagm", "FLAGM">; +let BackendFeature = "altnzcv" in def : FMVExtension<"flagm2", "FLAGM2">; +def : FMVExtension<"fp", "FP">; +def : FMVExtension<"fp16", "FP16">; +def : FMVExtension<"fp16fml", "FP16FML">; +let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FRINTTS">; +def : FMVExtension<"i8mm", "I8MM">; +def : FMVExtension<"jscvt", "JSCVT">; +def : FMVExtension<"ls64", "LS64_ACCDATA">; +def : FMVExtension<"lse", "LSE">; +def : FMVExtension<"memtag", "MEMTAG2">; +def : FMVExtension<"mops", "MOPS">; +def : FMVExtension<"predres", "PREDRES">; +def : FMVExtension<"rcpc", "RCPC">; +let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "RCPC2">; +def : FMVExtension<"rcpc3", "RCPC3">; +def : FMVExtension<"rdm", "RDM">; +def : FMVExtension<"rng", "RNG">; +def : FMVExtension<"sb", "SB">; +def : FMVExtension<"sha2", "SHA2">; +def : FMVExtension<"sha3", "SHA3">; +def : FMVExtension<"simd", "SIMD">; +def : FMVExtension<"sm4", "SM4">; +def : FMVExtension<"sme", "SME">; +def : 
FMVExtension<"sme-f64f64", "SME_F64">; +def : FMVExtension<"sme-i16i64", "SME_I64">; +def : FMVExtension<"sme2", "SME2">; +def : FMVExtension<"ssbs", "SSBS2">; +def : FMVExtension<"sve", "SVE">; +def : FMVExtension<"sve2", "SVE2">; +def : FMVExtension<"sve2-aes", "SVE_PMULL128">; +def : FMVExtension<"sve2-bitperm", "SVE_BITPERM">; +def : FMVExtension<"sve2-sha3", "SVE_SHA3">; +def : FMVExtension<"sve2-sm4", "SVE_SM4">; +def : FMVExtension<"wfxt", "WFXT">; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a81e7935e59685..56b9a40557baaf 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -252,7 +252,7 @@ uint64_t AArch64TTIImpl::getFeatureMask(Function &F) const { FeatureStr.split(Features, ","); if (none_of(Features, [](StringRef Feat) { return Feat == "+fmv"; })) return 0; - return AArch64::getCpuSupportsMask(Features); + return AArch64::getPriorityMask(Features); } bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 5313cdced8981b..83d177afd904b6 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -57,26 +57,35 @@ lookupFMVByID(llvm::AArch64::ArchExtKind ExtID) { } unsigned AArch64::getFMVPriority(ArrayRef Features) { - constexpr unsigned MaxFMVPriority = 1000; - unsigned Priority = 0; - unsigned NumFeatures = 0; + constexpr unsigned MaxFMVPriority = 100; + uint64_t Priority = 0; + FeatPriorities TopBit = static_cast(0); for (StringRef Feature : Features) { - if (auto Ext = parseFMVExtension(Feature)) { - Priority = std::max(Priority, Ext->Priority); - NumFeatures++; + if (auto FMVExt = parseFMVExtension(Feature)) { + TopBit = std::max(TopBit, FMVExt->PriorityBit); + Priority |= (1ULL << FMVExt->PriorityBit); } } - return 
Priority + MaxFMVPriority * NumFeatures; + return TopBit + MaxFMVPriority * popcount(Priority); +} + +uint64_t AArch64::getPriorityMask(ArrayRef Features) { + uint64_t PriorityMask = 0; + for (StringRef Feature : Features) { + if (auto FMVExt = parseFMVExtension(Feature)) + PriorityMask |= (1ULL << FMVExt->PriorityBit); + else if (auto ArchExt = targetFeatureToExtension(Feature)) + if (auto FMVExt = lookupFMVByID(ArchExt->ID)) + PriorityMask |= (1ULL << FMVExt->PriorityBit); + } + return PriorityMask; } uint64_t AArch64::getCpuSupportsMask(ArrayRef FeatureStrs) { uint64_t FeaturesMask = 0; for (const StringRef &FeatureStr : FeatureStrs) { - if (auto FMVExt = parseFMVExtension(FeatureStr)) - FeaturesMask |= (1ULL << FMVExt->Bit); - else if (auto ArchExt = targetFeatureToExtension(FeatureStr)) - if (auto FMVExt = lookupFMVByID(ArchExt->ID)) - FeaturesMask |= (1ULL << FMVExt->Bit); + if (auto Ext = parseFMVExtension(FeatureStr)) + FeaturesMask |= (1ULL << Ext->FeatureBit); } return FeaturesMask; } diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index 8e0072c3416b5f..fb89a7c06489df 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -7,18 +7,12 @@ target triple = "aarch64-unknown-linux-gnu" $test_single_bb_resolver.resolver = comdat any $test_multi_bb_resolver.resolver = comdat any $test_caller_feats_not_implied.resolver = comdat any -$foo.resolver = comdat any -$bar.resolver = comdat any -$goo.resolver = comdat any @__aarch64_cpu_features = external local_unnamed_addr global { i64 } @test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver @test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver @test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver -@foo = weak_odr ifunc i32 (), ptr @foo.resolver -@bar = weak_odr ifunc i32 
(), ptr @bar.resolver -@goo = weak_odr ifunc i32 (), ptr @goo.resolver declare void @__init_cpu_features_resolver() local_unnamed_addr @@ -45,50 +39,32 @@ resolver_entry: define i32 @foo._Msve() #1 { ; CHECK-LABEL: define i32 @foo._Msve( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() ; entry: %call = tail call i32 @test_single_bb_resolver() - %add = add nsw i32 %call, 30 - ret i32 %add + ret i32 %call } define i32 @foo._Msve2() #2 { ; CHECK-LABEL: define i32 @foo._Msve2( -; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() ; entry: %call = tail call i32 @test_single_bb_resolver() - %add = add nsw i32 %call, 20 - ret i32 %add + ret i32 %call } define i32 @foo.default() #0 { ; CHECK-LABEL: define i32 @foo.default( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() ; entry: %call = tail call i32 @test_single_bb_resolver() - %add = add nsw i32 %call, 10 - ret i32 %add -} - -define weak_odr ptr @foo.resolver() #0 comdat { -; CHECK-LABEL: define weak_odr ptr @foo.resolver( -; CHECK-SAME: ) #[[ATTR0]] comdat { -resolver_entry: - tail call void @__init_cpu_features_resolver() - %0 = load i64, ptr @__aarch64_cpu_features, align 8 - %1 = and i64 %0, 68719476736 - %.not = icmp eq i64 %1, 0 - %2 = and i64 %0, 1073741824 - %.not3 = icmp eq i64 %2, 0 - %foo._Msve.foo.default = select i1 %.not3, ptr @foo.default, ptr @foo._Msve - %common.ret.op = select i1 %.not, ptr %foo._Msve.foo.default, ptr @foo._Msve2 - ret ptr %common.ret.op + ret i32 %call } declare i32 @test_multi_bb_resolver._Mmops() #3 @@ -127,13 +103,12 @@ resolver_else2: ; preds = %resolver_else define i32 @bar._MmopsMsve2() #4 { ; CHECK-LABEL: define i32 @bar._MmopsMsve2( -; 
CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() ; entry: %call = tail call i32 @test_multi_bb_resolver() - %add = add nsw i32 %call, 40 - ret i32 %add + ret i32 %call } define i32 @bar._Mmops() #3 { @@ -143,45 +118,27 @@ define i32 @bar._Mmops() #3 { ; entry: %call = tail call i32 @test_multi_bb_resolver() - %add = add nsw i32 %call, 30 - ret i32 %add + ret i32 %call } define i32 @bar._Msve() #1 { ; CHECK-LABEL: define i32 @bar._Msve( -; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() ; entry: %call = tail call i32 @test_multi_bb_resolver() - %add = add nsw i32 %call, 20 - ret i32 %add + ret i32 %call } define i32 @bar.default() #0 { ; CHECK-LABEL: define i32 @bar.default( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() ; entry: %call = tail call i32 @test_multi_bb_resolver() - %add = add nsw i32 %call, 10 - ret i32 %add -} - -define weak_odr ptr @bar.resolver() #0 comdat { -; CHECK-LABEL: define weak_odr ptr @bar.resolver( -; CHECK-SAME: ) #[[ATTR0]] comdat { -resolver_entry: - tail call void @__init_cpu_features_resolver() - %0 = load i64, ptr @__aarch64_cpu_features, align 8 - %1 = and i64 %0, 576460821022900224 - %2 = icmp eq i64 %1, 576460821022900224 - %3 = and i64 %0, 1073741824 - %.not = icmp eq i64 %3, 0 - %bar._Msve.bar.default = select i1 %.not, ptr @bar.default, ptr @bar._Msve - %common.ret.op = select i1 %2, ptr @bar._MmopsMsve2, ptr %bar._Msve.bar.default - ret ptr %common.ret.op + ret i32 %call } declare i32 @test_caller_feats_not_implied._Mmops() #3 @@ -220,7 +177,7 @@ resolver_else2: ; preds = %resolver_else define i32 @goo._Mmops() #3 { ; CHECK-LABEL: define i32 @goo._Mmops( -; CHECK-SAME: ) #[[ATTR3]] { +; CHECK-SAME: ) local_unnamed_addr 
#[[ATTR3]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() ; entry: @@ -230,7 +187,7 @@ entry: define i32 @goo._Msve() #1 { ; CHECK-LABEL: define i32 @goo._Msve( -; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() ; entry: @@ -240,7 +197,7 @@ entry: define i32 @goo.default() #0 { ; CHECK-LABEL: define i32 @goo.default( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() ; entry: @@ -248,21 +205,6 @@ entry: ret i32 %call } -define weak_odr ptr @goo.resolver() #0 comdat { -; CHECK-LABEL: define weak_odr ptr @goo.resolver( -; CHECK-SAME: ) #[[ATTR0]] comdat { -resolver_entry: - tail call void @__init_cpu_features_resolver() - %0 = load i64, ptr @__aarch64_cpu_features, align 8 - %1 = and i64 %0, 576460752303423488 - %.not = icmp eq i64 %1, 0 - %2 = and i64 %0, 1073741824 - %.not3 = icmp eq i64 %2, 0 - %goo._Msve.goo.default = select i1 %.not3, ptr @goo.default, ptr @goo._Msve - %common.ret.op = select i1 %.not, ptr %goo._Msve.goo.default, ptr @goo._Mmops - ret ptr %common.ret.op -} - attributes #0 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" } attributes #1 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" } attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" } diff --git a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp index 3b02f63e9490b1..a8c7acbcd1dd1f 100644 --- a/llvm/utils/TableGen/ARMTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/ARMTargetDefEmitter.cpp @@ -162,14 +162,14 @@ static void emitARMTargetDef(const RecordKeeper &RK, raw_ostream &OS) { for (const Record *Rec : FMVExts) { OS << " I.emplace_back("; OS << "\"" << Rec->getValueAsString("Name") << "\""; - OS << ", " << 
Rec->getValueAsString("Bit"); + OS << ", " << Rec->getValueAsString("FeatureBit"); auto FeatName = Rec->getValueAsString("BackendFeature"); const Record *FeatRec = ExtensionMap[FeatName]; if (FeatRec) OS << ", " << FeatRec->getValueAsString("ArchExtKindSpelling").upper(); else OS << ", std::nullopt"; - OS << ", " << (uint64_t)Rec->getValueAsInt("Priority"); + OS << ", " << Rec->getValueAsString("PriorityBit"); OS << ");\n"; }; OS << " return I;\n" From 5314bc2e3de316eae1a69c3ab0e48f5c9fe7d010 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Sat, 30 Nov 2024 13:34:51 +0000 Subject: [PATCH 04/10] Changes from last revision Allow the optimization when the caller is non FMV but the attributes match. --- .../llvm/Analysis/TargetTransformInfo.h | 8 +++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 2 ++ llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +++ .../AArch64/AArch64TargetTransformInfo.cpp | 9 ++++-- .../AArch64/AArch64TargetTransformInfo.h | 2 ++ llvm/lib/Transforms/IPO/GlobalOpt.cpp | 19 ++++------- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 32 ++++++++++++++++++- 7 files changed, 61 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 12601e39294869..ee3163fd9a599d 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1844,6 +1844,9 @@ class TargetTransformInfo { /// Returns a bitmask constructed from the target features of a function. uint64_t getFeatureMask(Function &F) const; + /// Returns true if this is an instance of a function with multiple versions. + bool isMultiversionedFunction(Function &F) const; + /// \return The maximum number of function arguments the target supports. 
unsigned getMaxNumArgs() const; @@ -2270,6 +2273,7 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual uint64_t getFeatureMask(Function &F) const = 0; + virtual bool isMultiversionedFunction(Function &F) const = 0; virtual unsigned getMaxNumArgs() const = 0; virtual unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const = 0; @@ -3090,6 +3094,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getFeatureMask(F); } + bool isMultiversionedFunction(Function &F) const override { + return Impl.isMultiversionedFunction(F); + } + unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 6034c3c84135cb..ecedb9c52cb26c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1026,6 +1026,8 @@ class TargetTransformInfoImplBase { uint64_t getFeatureMask(Function &F) const { return 0; } + bool isMultiversionedFunction(Function &F) const { return false; } + unsigned getMaxNumArgs() const { return UINT_MAX; } unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e0ced05ecf10f2..bc0557d721de8d 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1364,6 +1364,10 @@ uint64_t TargetTransformInfo::getFeatureMask(Function &F) const { return TTIImpl->getFeatureMask(F); } +bool TargetTransformInfo::isMultiversionedFunction(Function &F) const { + return TTIImpl->isMultiversionedFunction(F); +} + unsigned TargetTransformInfo::getMaxNumArgs() const { return TTIImpl->getMaxNumArgs(); } diff --git 
a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 56b9a40557baaf..82fb90cbf8d905 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -250,11 +250,16 @@ uint64_t AArch64TTIImpl::getFeatureMask(Function &F) const { StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString(); SmallVector Features; FeatureStr.split(Features, ","); - if (none_of(Features, [](StringRef Feat) { return Feat == "+fmv"; })) - return 0; return AArch64::getPriorityMask(Features); } +bool AArch64TTIImpl::isMultiversionedFunction(Function &F) const { + StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString(); + SmallVector Features; + FeatureStr.split(Features, ","); + return any_of(Features, [](StringRef Feat) { return Feat == "+fmv"; }); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 1a26a55282a028..cb0e7cd496f250 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -90,6 +90,8 @@ class AArch64TTIImpl : public BasicTTIImplBase { uint64_t getFeatureMask(Function &F) const; + bool isMultiversionedFunction(Function &F) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index f6fc0a9c2f1675..442487d242664a 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2702,24 +2702,19 @@ static bool OptimizeNonTrivialIFuncs( TargetTransformInfo &TTI = GetTTI(*Resolver); + // This IFunc is not FMV. 
+ if (any_of(Callees, [&TTI](Function *F) { + return !TTI.isMultiversionedFunction(*F); + })) + continue; + // Cache the feature mask for each callee. - bool IsFMV = true; for (Function *Callee : Callees) { auto [It, Inserted] = FeatureMask.try_emplace(Callee); - if (Inserted) { + if (Inserted) It->second = TTI.getFeatureMask(*Callee); - // Empty mask means this isn't an FMV callee. - if (It->second == 0) { - IsFMV = false; - break; - } - } } - // This IFunc is not FMV. - if (!IsFMV) - continue; - // Sort the callee versions in decreasing priority order. sort(Callees, [&](auto *LHS, auto *RHS) { return FeatureMask[LHS] > FeatureMask[RHS]; diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index fb89a7c06489df..c6fca86a796394 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied)" --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller)" --version 4 ; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -7,12 +7,14 @@ target triple = "aarch64-unknown-linux-gnu" $test_single_bb_resolver.resolver = comdat any $test_multi_bb_resolver.resolver = comdat any $test_caller_feats_not_implied.resolver = comdat any +$test_non_fmv_caller.resolver = comdat any @__aarch64_cpu_features = external local_unnamed_addr global { i64 } @test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver @test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver 
@test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver +@test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver declare void @__init_cpu_features_resolver() local_unnamed_addr @@ -205,9 +207,37 @@ entry: ret i32 %call } +declare i32 @test_non_fmv_caller._Maes() #6 + +declare i32 @test_non_fmv_caller.default() #0 + +define weak_odr ptr @test_non_fmv_caller.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_non_fmv_caller.resolver( +; CHECK-SAME: ) #[[ATTR0]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 32768 + %.not = icmp eq i64 %1, 0 + %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr @test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes + ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default +} + +define i32 @baz() #7 { +; CHECK-LABEL: define i32 @baz( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + attributes #0 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" } attributes #1 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" } attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" } attributes #3 = { "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a" } attributes #4 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outline-atomics,+sve,+sve2,+v8a" } attributes #5 = { "target-features"="+bf16,+fmv,+fp-armv8,+neon,+outline-atomics,+sme,+v8a" } +attributes #6 = { "target-features"="+aes,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #7 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } From 2c3b4d17cd81f75b6ebd2081bcfdc441408e691a Mon 
Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Sat, 30 Nov 2024 14:51:04 +0000 Subject: [PATCH 05/10] Changes from last revision Add a problematic test case --- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 64 ++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index c6fca86a796394..4284756ec939f5 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller)" --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority)" --version 4 ; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -8,6 +8,7 @@ $test_single_bb_resolver.resolver = comdat any $test_multi_bb_resolver.resolver = comdat any $test_caller_feats_not_implied.resolver = comdat any $test_non_fmv_caller.resolver = comdat any +$test_priority.resolver = comdat any @__aarch64_cpu_features = external local_unnamed_addr global { i64 } @@ -15,6 +16,7 @@ $test_non_fmv_caller.resolver = comdat any @test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver @test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver @test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver +@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver declare void @__init_cpu_features_resolver() local_unnamed_addr @@ -233,6 +235,62 @@ entry: ret i32 %call } +declare i32 
@test_priority._Msve2-sha3() #8 + +declare i32 @test_priority._Mls64Mssbs() #9 + +declare i32 @test_priority._MflagmMlseMrng() #10 + +declare i32 @test_priority.default() #0 + +define weak_odr ptr @test_priority.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_priority.resolver( +; CHECK-SAME: ) #[[ATTR0]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 131 + %2 = icmp eq i64 %1, 131 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 9570149208162304 + %4 = icmp eq i64 %3, 9570149208162304 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 1099511627776 + %.not = icmp eq i64 %5, 0 + %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr @test_priority._Msve2-sha3 + br label %common.ret +} + +; FIXME: This is wrong. When generating the resolver the selection algorithm +; orders the function versions according to the number of specified features, +; from highest to lowest. (Note: I am going on a tangent here but in the case +; of a tie, the version with the highest priority feature is preferred. This +; is non-deterministic if the highest priority feature is common. For example, +; mops+sve vs mops+sve2.) +; +; In this example the problem is slightly different. When in IR we can't know +; what the features were before their dependencies got expanded. Therefore +; we can select based on highest priority feature, then second, then third, +; etc...
That's what we should be doing in the front-end too if you ask me. +; +define i32 @hoo._MflagmMls64MlseMrngMssbsMsve2-sha3() #11 { +; CHECK-LABEL: define i32 @hoo._MflagmMls64MlseMrngMssbsMsve2-sha3( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR11:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() +; +entry: + %call = tail call i32 @test_priority() + ret i32 %call +} + attributes #0 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" } attributes #1 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" } attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" } @@ -241,3 +299,7 @@ attributes #4 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outli attributes #5 = { "target-features"="+bf16,+fmv,+fp-armv8,+neon,+outline-atomics,+sme,+v8a" } attributes #6 = { "target-features"="+aes,+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" } attributes #7 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #8 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sha2,+sha3,+sve,+sve2,+sve2-sha3,+v8a" } +attributes #9 = { "target-features"="+fmv,+fp-armv8,+ls64,+neon,+outline-atomics,+ssbs,+v8a" } +attributes #10 = { "target-features"="+flagm,+fmv,+fp-armv8,+lse,+neon,+outline-atomics,+rand,+v8a" } +attributes #11 = { "target-features"="+flagm,+fmv,+fp-armv8,+fullfp16,+ls64,+lse,+neon,+outline-atomics,+rand,+sha2,+sha3,+ssbs,+sve,+sve2,+sve2-sha3,+v8a" } From 2b10388269a3675049a72a48cc0bff14ff10ea05 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Mon, 2 Dec 2024 10:16:39 +0000 Subject: [PATCH 06/10] clang-format --- llvm/include/llvm/TargetParser/AArch64TargetParser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index bd9354b4e7fa1f..09b02615fd5e63 100644 --- 
a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -75,7 +75,7 @@ struct FMVInfo { FeatPriorities PriorityBit; // FMV priority. FMVInfo(StringRef Name, CPUFeatures FeatureBit, std::optional ID, FeatPriorities PriorityBit) - : Name(Name), FeatureBit(FeatureBit), ID(ID), PriorityBit(PriorityBit){}; + : Name(Name), FeatureBit(FeatureBit), ID(ID), PriorityBit(PriorityBit) {}; }; const std::vector &getFMVInfo(); From 751d4e4dead0ff78b86cb40836f465f3bd09dc5a Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Thu, 9 Jan 2025 10:44:24 +0000 Subject: [PATCH 07/10] Minor adjustments in test file: * Rename caller function names * Remove whitespace between function version declarations. --- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 60 ++++++++----------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index dad0e131b26dcc..bbfbedb2243dc4 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -21,9 +21,7 @@ $test_priority.resolver = comdat any declare void @__init_cpu_features_resolver() local_unnamed_addr declare i32 @test_single_bb_resolver.default() #0 - declare i32 @test_single_bb_resolver._Msve() #1 - declare i32 @test_single_bb_resolver._Msve2() #2 define weak_odr ptr @test_single_bb_resolver.resolver() comdat { @@ -40,8 +38,8 @@ resolver_entry: ret ptr %common.ret.op } -define i32 @foo._Msve() #1 { -; CHECK-LABEL: define i32 @foo._Msve( +define i32 @caller1._Msve() #1 { +; CHECK-LABEL: define i32 @caller1._Msve( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() ; @@ -50,8 +48,8 @@ entry: ret i32 %call } -define i32 @foo._Msve2() #2 { -; CHECK-LABEL: define i32 @foo._Msve2( +define i32 @caller1._Msve2() #2 { +; CHECK-LABEL: define 
i32 @caller1._Msve2( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() ; @@ -60,8 +58,8 @@ entry: ret i32 %call } -define i32 @foo.default() #0 { -; CHECK-LABEL: define i32 @foo.default( +define i32 @caller1.default() #0 { +; CHECK-LABEL: define i32 @caller1.default( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() ; @@ -71,11 +69,8 @@ entry: } declare i32 @test_multi_bb_resolver._Mmops() #3 - declare i32 @test_multi_bb_resolver._Msve2() #2 - declare i32 @test_multi_bb_resolver._Msve() #1 - declare i32 @test_multi_bb_resolver.default() #0 define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { @@ -103,8 +98,8 @@ resolver_else2: ; preds = %resolver_else br label %common.ret } -define i32 @bar._MmopsMsve2() #4 { -; CHECK-LABEL: define i32 @bar._MmopsMsve2( +define i32 @caller2._MmopsMsve2() #4 { +; CHECK-LABEL: define i32 @caller2._MmopsMsve2( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() ; @@ -113,8 +108,8 @@ entry: ret i32 %call } -define i32 @bar._Mmops() #3 { -; CHECK-LABEL: define i32 @bar._Mmops( +define i32 @caller2._Mmops() #3 { +; CHECK-LABEL: define i32 @caller2._Mmops( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() ; @@ -123,8 +118,8 @@ entry: ret i32 %call } -define i32 @bar._Msve() #1 { -; CHECK-LABEL: define i32 @bar._Msve( +define i32 @caller2._Msve() #1 { +; CHECK-LABEL: define i32 @caller2._Msve( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() ; @@ -133,8 +128,8 @@ entry: ret i32 %call } -define i32 @bar.default() #0 { -; CHECK-LABEL: define i32 @bar.default( +define i32 @caller2.default() #0 { +; CHECK-LABEL: define i32 @caller2.default( ; CHECK-SAME: ) 
local_unnamed_addr #[[ATTR0]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() ; @@ -144,11 +139,8 @@ entry: } declare i32 @test_caller_feats_not_implied._Mmops() #3 - declare i32 @test_caller_feats_not_implied._Msme() #5 - declare i32 @test_caller_feats_not_implied._Msve() #1 - declare i32 @test_caller_feats_not_implied.default() #0 define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { @@ -176,8 +168,8 @@ resolver_else2: ; preds = %resolver_else br label %common.ret } -define i32 @goo._Mmops() #3 { -; CHECK-LABEL: define i32 @goo._Mmops( +define i32 @caller3._Mmops() #3 { +; CHECK-LABEL: define i32 @caller3._Mmops( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() ; @@ -186,8 +178,8 @@ entry: ret i32 %call } -define i32 @goo._Msve() #1 { -; CHECK-LABEL: define i32 @goo._Msve( +define i32 @caller3._Msve() #1 { +; CHECK-LABEL: define i32 @caller3._Msve( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() ; @@ -196,8 +188,8 @@ entry: ret i32 %call } -define i32 @goo.default() #0 { -; CHECK-LABEL: define i32 @goo.default( +define i32 @caller3.default() #0 { +; CHECK-LABEL: define i32 @caller3.default( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() ; @@ -207,7 +199,6 @@ entry: } declare i32 @test_non_fmv_caller._Maes() #6 - declare i32 @test_non_fmv_caller.default() #0 define weak_odr ptr @test_non_fmv_caller.resolver() comdat { @@ -221,8 +212,8 @@ resolver_entry: ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default } -define i32 @baz() #7 { -; CHECK-LABEL: define i32 @baz( +define i32 @caller4() #7 { +; CHECK-LABEL: define i32 @caller4( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() ; @@ -232,11 +223,8 @@ entry: } declare i32 
@test_priority._Msve2-sha3() #8 - declare i32 @test_priority._Mls64Mssbs() #9 - declare i32 @test_priority._MflagmMlseMrng() #10 - declare i32 @test_priority.default() #0 define weak_odr ptr @test_priority.resolver() comdat { @@ -264,8 +252,8 @@ resolver_else2: ; preds = %resolver_else br label %common.ret } -define i32 @hoo._MflagmMls64MlseMrngMssbsMsve2-sha3() #11 { -; CHECK-LABEL: define i32 @hoo._MflagmMls64MlseMrngMssbsMsve2-sha3( +define i32 @caller5._MflagmMls64MlseMrngMssbsMsve2-sha3() #11 { +; CHECK-LABEL: define i32 @caller5._MflagmMls64MlseMrngMssbsMsve2-sha3( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR11:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() ; From bead4dcff13c960446956b037521221e7943c409 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Thu, 9 Jan 2025 13:47:33 +0000 Subject: [PATCH 08/10] Add a test for parsing fmv features whose backend feature has alternative name. --- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 78 ++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index bbfbedb2243dc4..50fec7b04b9aba 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority)" --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 ; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -9,6 +9,7 @@ 
$test_multi_bb_resolver.resolver = comdat any $test_caller_feats_not_implied.resolver = comdat any $test_non_fmv_caller.resolver = comdat any $test_priority.resolver = comdat any +$test_alternative_names.resolver = comdat any @__aarch64_cpu_features = external local_unnamed_addr global { i64 } @@ -17,6 +18,7 @@ $test_priority.resolver = comdat any @test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver @test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver @test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver +@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver declare void @__init_cpu_features_resolver() local_unnamed_addr @@ -262,6 +264,76 @@ entry: ret i32 %call } +declare i32 @test_alternative_names._Mdpb2Mfrintts() #12 +declare i32 @test_alternative_names._Mflagm2Mfrintts() #13 +declare i32 @test_alternative_names._Mrcpc2() #14 +declare i32 @test_alternative_names.default() #0 + +define weak_odr ptr @test_alternative_names.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_alternative_names.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 17563904 + %2 = icmp eq i64 %1, 17563904 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_alternative_names._Mdpb2Mfrintts, %resolver_entry ], [ @test_alternative_names._Mflagm2Mfrintts, %resolver_else ], [ %test_alternative_names._Mrcpc2.test_alternative_names.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 16777478 + %4 = icmp eq i64 %3, 16777478 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 12582912 + %6 = icmp eq i64 %5, 12582912 + 
%test_alternative_names._Mrcpc2.test_alternative_names.default = select i1 %6, ptr @test_alternative_names._Mrcpc2, ptr @test_alternative_names.default + br label %common.ret +} + +define i32 @caller6._Mdpb2Mfrintts() #12 { +; CHECK-LABEL: define i32 @caller6._Mdpb2Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller6._Mfrintts() #15 { +; CHECK-LABEL: define i32 @caller6._Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller6._Mrcpc2() #14 { +; CHECK-LABEL: define i32 @caller6._Mrcpc2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR14:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller6.default() #0 { +; CHECK-LABEL: define i32 @caller6.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + attributes #0 = { "fmv-features" } attributes #1 = { "fmv-features"="sve" } attributes #2 = { "fmv-features"="sve2" } @@ -274,3 +346,7 @@ attributes #8 = { "fmv-features"="sve2-sha3" } attributes #9 = { "fmv-features"="ls64,ssbs" } attributes #10 = { "fmv-features"="flagm,lse,rng" } attributes #11 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } +attributes #12 = { "fmv-features"="dpb2,frintts" } +attributes #13 = { "fmv-features"="flagm2,frintts" } +attributes #14 = { "fmv-features"="rcpc2" } +attributes #15 = { "fmv-features"="frintts" } From b9896b72c963bd0263fec90640129aa563ff2314 Mon Sep 17 00:00:00 2001 From: 
Alexandros Lamprineas Date: Thu, 9 Jan 2025 14:22:23 +0000 Subject: [PATCH 09/10] Early exit if collectVersions fails to walk the use-def chain at any point. --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 42 ++++++++++++++++----------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 6ad67e203db9c5..29b421873a3ee4 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2642,17 +2642,28 @@ DeleteDeadIFuncs(Module &M, } // Follows the use-def chain of \p V backwards until it finds a Function, -// in which case it collects in \p Versions. -static void collectVersions(Value *V, SmallVectorImpl<Function *> &Versions) { +// in which case it collects in \p Versions. Returns true on successful +// use-def chain traversal, false otherwise. +static bool collectVersions(TargetTransformInfo &TTI, Value *V, + SmallVectorImpl<Function *> &Versions) { if (auto *F = dyn_cast<Function>(V)) { + if (!TTI.isMultiversionedFunction(*F)) + return false; Versions.push_back(F); } else if (auto *Sel = dyn_cast<SelectInst>(V)) { - collectVersions(Sel->getTrueValue(), Versions); - collectVersions(Sel->getFalseValue(), Versions); + if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + return false; + if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + return false; } else if (auto *Phi = dyn_cast<PHINode>(V)) { for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) - collectVersions(Phi->getIncomingValue(I), Versions); + if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + return false; + } else { + // Unknown instruction type. Bail. + return false; } + return true; } // Bypass the IFunc Resolver of MultiVersioned functions when possible. To @@ -2690,23 +2701,20 @@ static bool OptimizeNonTrivialIFuncs( if (Resolver->isInterposable()) continue; - // Discover the callee versions.
- SmallVector<Function *> Callees; - for (BasicBlock &BB : *Resolver) - if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator())) - collectVersions(Ret->getReturnValue(), Callees); - - if (Callees.empty()) - continue; - TargetTransformInfo &TTI = GetTTI(*Resolver); - // This IFunc is not FMV. - if (any_of(Callees, [&TTI](Function *F) { - return !TTI.isMultiversionedFunction(*F); + // Discover the callee versions. + SmallVector<Function *> Callees; + if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator())) + if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + return true; + return false; })) continue; + assert(!Callees.empty() && "Expecting successful collection of versions"); + // Cache the feature mask for each callee. for (Function *Callee : Callees) { auto [It, Inserted] = FeatureMask.try_emplace(Callee); From e82feccf717ce40a0750c4ae028d9c95df03877a Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 10 Jan 2025 13:31:45 +0000 Subject: [PATCH 10/10] Only consider the highest priority callee when the caller is non-FMV. Add a corresponding test case. --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 38 ++++++---- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 73 +++++++++++-------- 2 files changed, 68 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 29b421873a3ee4..bf0cacc6224be8 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2761,20 +2761,32 @@ static bool OptimizeNonTrivialIFuncs( Function *Callee = Callees[I]; uint64_t CallerBits = FeatureMask[Caller]; uint64_t CalleeBits = FeatureMask[Callee]; - // If the feature set of the caller implies the feature set of the - // highest priority candidate then it shall be picked. In case of - // identical sets advance the candidate index one position.
- if (CallerBits == CalleeBits) - ++I; - else if (!implies(CallerBits, CalleeBits)) { - // Keep advancing the candidate index as long as the caller's - // features are a subset of the current candidate's. - while (implies(CalleeBits, CallerBits)) { - if (++I == Callees.size()) - break; - CalleeBits = FeatureMask[Callees[I]]; + + // In the case of FMV callers, we know that all higher priority callers + // than the current one did not get selected at runtime, which helps + // reason about the callees (if they have versions that mandate presence + // of the features which we already know are unavailable on this target). + if (TTI.isMultiversionedFunction(*Caller)) { + // If the feature set of the caller implies the feature set of the + // highest priority candidate then it shall be picked. In case of + // identical sets advance the candidate index one position. + if (CallerBits == CalleeBits) + ++I; + else if (!implies(CallerBits, CalleeBits)) { + // Keep advancing the candidate index as long as the caller's + // features are a subset of the current candidate's. + while (implies(CalleeBits, CallerBits)) { + if (++I == Callees.size()) + break; + CalleeBits = FeatureMask[Callees[I]]; + } + continue; } - continue; + } else { + // We can't reason much about non-FMV callers. Just pick the highest + // priority callee if it matches, otherwise bail. 
+ if (I > 0 || !implies(CallerBits, CalleeBits)) + continue; } auto &Calls = CallSites[Caller]; for (CallBase *CS : Calls) diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index 50fec7b04b9aba..90bd98a9b0d381 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -201,6 +201,7 @@ entry: } declare i32 @test_non_fmv_caller._Maes() #6 +declare i32 @test_non_fmv_caller._Msm4() #7 declare i32 @test_non_fmv_caller.default() #0 define weak_odr ptr @test_non_fmv_caller.resolver() comdat { @@ -214,7 +215,7 @@ resolver_entry: ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default } -define i32 @caller4() #7 { +define i32 @caller4() #8 { ; CHECK-LABEL: define i32 @caller4( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() @@ -224,9 +225,19 @@ entry: ret i32 %call } -declare i32 @test_priority._Msve2-sha3() #8 -declare i32 @test_priority._Mls64Mssbs() #9 -declare i32 @test_priority._MflagmMlseMrng() #10 +define i32 @caller5() #9 { +; CHECK-LABEL: define i32 @caller5( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +declare i32 @test_priority._Msve2-sha3() #10 +declare i32 @test_priority._Mls64Mssbs() #11 +declare i32 @test_priority._MflagmMlseMrng() #12 declare i32 @test_priority.default() #0 define weak_odr ptr @test_priority.resolver() comdat { @@ -254,9 +265,9 @@ resolver_else2: ; preds = %resolver_else br label %common.ret } -define i32 @caller5._MflagmMls64MlseMrngMssbsMsve2-sha3() #11 { -; CHECK-LABEL: define i32 @caller5._MflagmMls64MlseMrngMssbsMsve2-sha3( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR11:[0-9]+]] { +define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3() #13 { +; CHECK-LABEL: define i32 
@caller6._MflagmMls64MlseMrngMssbsMsve2-sha3( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() ; entry: @@ -264,9 +275,9 @@ entry: ret i32 %call } -declare i32 @test_alternative_names._Mdpb2Mfrintts() #12 -declare i32 @test_alternative_names._Mflagm2Mfrintts() #13 -declare i32 @test_alternative_names._Mrcpc2() #14 +declare i32 @test_alternative_names._Mdpb2Mfrintts() #14 +declare i32 @test_alternative_names._Mflagm2Mfrintts() #15 +declare i32 @test_alternative_names._Mrcpc2() #16 declare i32 @test_alternative_names.default() #0 define weak_odr ptr @test_alternative_names.resolver() comdat { @@ -294,9 +305,9 @@ resolver_else2: ; preds = %resolver_else br label %common.ret } -define i32 @caller6._Mdpb2Mfrintts() #12 { -; CHECK-LABEL: define i32 @caller6._Mdpb2Mfrintts( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { +define i32 @caller7._Mdpb2Mfrintts() #14 { +; CHECK-LABEL: define i32 @caller7._Mdpb2Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR13:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts() ; entry: @@ -304,9 +315,9 @@ entry: ret i32 %call } -define i32 @caller6._Mfrintts() #15 { -; CHECK-LABEL: define i32 @caller6._Mfrintts( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { +define i32 @caller7._Mfrintts() #17 { +; CHECK-LABEL: define i32 @caller7._Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR16:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names() ; entry: @@ -314,9 +325,9 @@ entry: ret i32 %call } -define i32 @caller6._Mrcpc2() #14 { -; CHECK-LABEL: define i32 @caller6._Mrcpc2( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR14:[0-9]+]] { +define i32 @caller7._Mrcpc2() #16 { +; CHECK-LABEL: define i32 @caller7._Mrcpc2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2() ; entry: @@ -324,8 +335,8 @@ entry: ret i32 %call } 
-define i32 @caller6.default() #0 { -; CHECK-LABEL: define i32 @caller6.default( +define i32 @caller7.default() #0 { +; CHECK-LABEL: define i32 @caller7.default( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { ; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default() ; @@ -341,12 +352,14 @@ attributes #3 = { "fmv-features"="mops" } attributes #4 = { "fmv-features"="mops,sve2" } attributes #5 = { "fmv-features"="sme" } attributes #6 = { "fmv-features"="aes" } -attributes #7 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } -attributes #8 = { "fmv-features"="sve2-sha3" } -attributes #9 = { "fmv-features"="ls64,ssbs" } -attributes #10 = { "fmv-features"="flagm,lse,rng" } -attributes #11 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } -attributes #12 = { "fmv-features"="dpb2,frintts" } -attributes #13 = { "fmv-features"="flagm2,frintts" } -attributes #14 = { "fmv-features"="rcpc2" } -attributes #15 = { "fmv-features"="frintts" } +attributes #7 = { "fmv-features"="sm4" } +attributes #8 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #9 = { "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+sm4" } +attributes #10 = { "fmv-features"="sve2-sha3" } +attributes #11 = { "fmv-features"="ls64,ssbs" } +attributes #12 = { "fmv-features"="flagm,lse,rng" } +attributes #13 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } +attributes #14 = { "fmv-features"="dpb2,frintts" } +attributes #15 = { "fmv-features"="flagm2,frintts" } +attributes #16 = { "fmv-features"="rcpc2" } +attributes #17 = { "fmv-features"="frintts" }