diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index fa9392b86c15b9..7b7e159a84b7a7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1762,6 +1762,12 @@ class TargetTransformInfo {
   /// false, but it shouldn't matter what it returns anyway.
   bool hasArmWideBranch(bool Thumb) const;
 
+  /// Returns true if the target supports Function MultiVersioning.
+  bool hasFMV() const;
+
+  /// Returns the MultiVersion priority of a given function.
+  uint64_t getFMVPriority(Function &F) const;
+
   /// \return The maximum number of function arguments the target supports.
   unsigned getMaxNumArgs() const;
 
@@ -2152,6 +2158,8 @@ class TargetTransformInfo::Concept {
   virtual VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
+  virtual bool hasFMV() const = 0;
+  virtual uint64_t getFMVPriority(Function &F) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
 };
 
@@ -2904,6 +2912,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.hasArmWideBranch(Thumb);
   }
 
+  bool hasFMV() const override { return Impl.hasFMV(); }
+
+  uint64_t getFMVPriority(Function &F) const override {
+    return Impl.getFMVPriority(F);
+  }
+
   unsigned getMaxNumArgs() const override {
     return Impl.getMaxNumArgs();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 63c2ef8912b29c..c7c9233f92da9e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -941,6 +941,10 @@ class TargetTransformInfoImplBase {
 
   bool hasArmWideBranch(bool) const { return false; }
 
+  bool hasFMV() const { return false; }
+
+  uint64_t getFMVPriority(Function &F) const { return 0; }
+
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
 protected:
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 805b963a7a13c7..152cfee8cf373d 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -846,6 +846,7 @@ const ArchInfo *getArchForCpu(StringRef CPU);
 // Parser
 const ArchInfo *parseArch(StringRef Arch);
 std::optional<ExtensionInfo> parseArchExtension(StringRef Extension);
+std::optional<ExtensionInfo> parseTargetFeature(StringRef Feature);
 // Given the name of a CPU or alias, return the correponding CpuInfo.
 std::optional<CpuInfo> parseCpu(StringRef Name);
 // Used by target parser tests
@@ -856,7 +857,8 @@ bool isX18ReservedByDefault(const Triple &TT);
 // For given feature names, return a bitmask corresponding to the entries of
 // AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks
 // themselves, they are sequential (0, 1, 2, 3, ...).
-uint64_t getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs);
+uint64_t getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs,
+                            bool IsBackEndFeature = false);
 
 void PrintSupportedExtensions(StringMap<StringRef> DescMap);
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 5f933b4587843c..a87be98113b6e0 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1296,6 +1296,12 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const {
   return TTIImpl->hasArmWideBranch(Thumb);
 }
 
+bool TargetTransformInfo::hasFMV() const { return TTIImpl->hasFMV(); }
+
+uint64_t TargetTransformInfo::getFMVPriority(Function &F) const {
+  return TTIImpl->getFMVPriority(F);
+}
+
 unsigned TargetTransformInfo::getMaxNumArgs() const {
   return TTIImpl->getMaxNumArgs();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ee7137b92445bb..7eb3243cc5aa48 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
@@ -231,6 +232,13 @@ static bool hasPossibleIncompatibleOps(const Function *F) {
   return false;
 }
 
+uint64_t AArch64TTIImpl::getFMVPriority(Function &F) const {
+  StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString();
+  SmallVector<StringRef, 8> Features;
+  FeatureStr.split(Features, ",");
+  return AArch64::getCpuSupportsMask(Features, /*IsBackEndFeature = */ true);
+}
+
 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                          const Function *Callee) const {
   SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index de39dea2be43e1..043e1716958485 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -83,6 +83,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
                                 unsigned DefaultCallPenalty) const;
 
+  bool hasFMV() const { return ST->hasFMV(); }
+
+  uint64_t getFMVPriority(Function &F) const;
+
   /// \name Scalar TTI Implementations
   /// @{
 
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index 71099462d5ecff..5eecde791a0336 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -47,12 +47,13 @@ std::optional<AArch64::ArchInfo> AArch64::ArchInfo::findBySubArch(StringRef SubA
   return {};
 }
 
-uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs) {
+uint64_t AArch64::getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs,
+                                     bool IsBackEndFeature) {
   uint64_t FeaturesMask = 0;
-  for (const StringRef &FeatureStr : FeatureStrs) {
-    if (auto Ext = parseArchExtension(FeatureStr))
+  for (const StringRef FeatureStr : FeatureStrs)
+    if (auto Ext = IsBackEndFeature ? parseTargetFeature(FeatureStr)
+                                    : parseArchExtension(FeatureStr))
       FeaturesMask |= (1ULL << Ext->CPUFeature);
-  }
 
   return FeaturesMask;
 }
@@ -132,6 +133,14 @@ std::optional<AArch64::ExtensionInfo> AArch64::parseArchExtension(StringRef Arch
   return {};
 }
 
+std::optional<AArch64::ExtensionInfo>
+AArch64::parseTargetFeature(StringRef Feature) {
+  for (const auto &E : Extensions)
+    if (Feature == E.Feature)
+      return E;
+  return {};
+}
+
 std::optional<AArch64::CpuInfo> AArch64::parseCpu(StringRef Name) {
   // Resolve aliases first.
   Name = resolveCPUAlias(Name);
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index da714c9a75701b..9b2aeed634aac6 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -89,7 +89,7 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
 STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
 STATISTIC(NumInternalFunc, "Number of internal functions");
 STATISTIC(NumColdCC, "Number of functions marked coldcc");
-STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
+STATISTIC(NumIFuncsResolved, "Number of resolved IFuncs");
 STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
 
 static cl::opt<bool>
@@ -2462,6 +2462,140 @@ DeleteDeadIFuncs(Module &M,
   return Changed;
 }
 
+// Follows the use-def chain of \p V backwards until it finds a Function,
+// in which case it collects in \p Versions.
+static void collectVersions(Value *V, SmallVectorImpl<Function *> &Versions) {
+  if (auto *F = dyn_cast<Function>(V)) {
+    Versions.push_back(F);
+  } else if (auto *Sel = dyn_cast<SelectInst>(V)) {
+    collectVersions(Sel->getTrueValue(), Versions);
+    collectVersions(Sel->getFalseValue(), Versions);
+  } else if (auto *Phi = dyn_cast<PHINode>(V)) {
+    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+      collectVersions(Phi->getIncomingValue(I), Versions);
+  }
+}
+
+// Bypass the IFunc Resolver of MultiVersioned functions when possible. To
+// deduce whether the optimization is legal we need to compare the target
+// features between caller and callee versions. The criteria for bypassing
+// the resolver are the following:
+//
+// * If the callee's feature set is a subset of the caller's feature set,
+//   then the callee is a candidate for direct call.
+//
+// * Among such candidates the one of highest priority is the best match
+//   and it shall be picked, unless there is a version of the callee with
+//   higher priority than the best match which cannot be picked from a
+//   higher priority caller (directly or through the resolver).
+//
+// * For every higher priority callee version than the best match, there
+//   is a higher priority caller version whose feature set availability
+//   is implied by the callee's feature set.
+//
+static bool OptimizeNonTrivialIFuncs(
+    Module &M, function_ref<TargetTransformInfo &(Function &)> GetTTI) {
+  bool Changed = false;
+
+  // Cache containing the mask constructed from a function's target features.
+  DenseMap<Function *, uint64_t> FeaturePriorityMap;
+
+  for (GlobalIFunc &IF : M.ifuncs()) {
+    if (IF.isInterposable())
+      continue;
+
+    Function *Resolver = IF.getResolverFunction();
+    if (!Resolver)
+      continue;
+
+    if (Resolver->isInterposable())
+      continue;
+
+    TargetTransformInfo &TTI = GetTTI(*Resolver);
+    // Bail out for targets without FMV support; report any change already
+    // made for earlier ifuncs rather than unconditionally returning false.
+    if (!TTI.hasFMV())
+      return Changed;
+
+    // Discover the callee versions.
+    SmallVector<Function *> Callees;
+    for (BasicBlock &BB : *Resolver)
+      if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator()))
+        collectVersions(Ret->getReturnValue(), Callees);
+
+    if (Callees.empty())
+      continue;
+
+    // Cache the feature mask for each callee.
+    for (Function *Callee : Callees) {
+      auto [It, Inserted] = FeaturePriorityMap.try_emplace(Callee);
+      if (Inserted)
+        It->second = TTI.getFMVPriority(*Callee);
+    }
+
+    // Sort the callee versions in decreasing priority order.
+    sort(Callees, [&](auto *LHS, auto *RHS) {
+      return FeaturePriorityMap[LHS] > FeaturePriorityMap[RHS];
+    });
+
+    // Find the callsites and cache the feature mask for each caller.
+    SmallVector<Function *> Callers;
+    DenseMap<Function *, SmallVector<CallBase *>> CallSiteMap;
+    for (User *U : IF.users()) {
+      if (auto *CB = dyn_cast<CallBase>(U)) {
+        if (CB->getCalledOperand() == &IF) {
+          Function *Caller = CB->getFunction();
+          auto [FeatIt, FeatInserted] = FeaturePriorityMap.try_emplace(Caller);
+          if (FeatInserted)
+            FeatIt->second = TTI.getFMVPriority(*Caller);
+          auto [CallIt, CallInserted] = CallSiteMap.try_emplace(Caller);
+          if (CallInserted)
+            Callers.push_back(Caller);
+          CallIt->second.push_back(CB);
+        }
+      }
+    }
+
+    // Sort the caller versions in decreasing priority order.
+    sort(Callers, [&](auto *LHS, auto *RHS) {
+      return FeaturePriorityMap[LHS] > FeaturePriorityMap[RHS];
+    });
+
+    // Index to the highest priority candidate.
+    unsigned I = 0;
+    // Now try to redirect calls starting from higher priority callers.
+    for (Function *Caller : Callers) {
+      // Getting here means we found callers of equal priority.
+      if (I == Callees.size())
+        break;
+      Function *Callee = Callees[I];
+      uint64_t CallerPriority = FeaturePriorityMap[Caller];
+      uint64_t CalleePriority = FeaturePriorityMap[Callee];
+      // If the priority of the caller is greater or equal to the highest
+      // priority candidate then it shall be picked. In case of equality
+      // advance the candidate index one position.
+      if (CallerPriority == CalleePriority)
+        ++I;
+      else if (CallerPriority < CalleePriority) {
+        // Keep advancing the candidate index as long as the caller's
+        // features are a subset of the current candidate's.
+        while ((CallerPriority & CalleePriority) == CallerPriority) {
+          if (++I == Callees.size())
+            break;
+          CalleePriority = FeaturePriorityMap[Callees[I]];
+        }
+        continue;
+      }
+      auto &CallSites = CallSiteMap[Caller];
+      for (CallBase *CS : CallSites)
+        CS->setCalledOperand(Callee);
+      Changed = true;
+    }
+    if (IF.use_empty() ||
+        all_of(IF.users(), [](User *U) { return isa<GlobalAlias>(U); }))
+      NumIFuncsResolved++;
+  }
+  return Changed;
+}
+
 static bool
 optimizeGlobalsInModule(Module &M, const DataLayout &DL,
                         function_ref<TargetLibraryInfo &(Function &)> GetTLI,
@@ -2525,6 +2659,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL,
     // Optimize IFuncs whose callee's are statically known.
     LocalChange |= OptimizeStaticIFuncs(M);
 
+    // Optimize IFuncs based on the target features of the caller.
+    LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI);
+
     // Remove any IFuncs that are now dead.
LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats); diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll new file mode 100644 index 00000000000000..21e0b7780865b9 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -0,0 +1,398 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied)" --version 4 +; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +$test_single_bb_resolver.resolver = comdat any +$test_multi_bb_resolver.resolver = comdat any +$test_caller_feats_not_implied.resolver = comdat any +$test_same_priority_callers.resolver = comdat any +$foo.resolver = comdat any +$bar.resolver = comdat any +$goo.resolver = comdat any +$baz.resolver = comdat any + +@__aarch64_cpu_features = external local_unnamed_addr global { i64 } + +@test_single_bb_resolver.ifunc = weak_odr alias i32 (), ptr @test_single_bb_resolver +@test_multi_bb_resolver.ifunc = weak_odr alias i32 (), ptr @test_multi_bb_resolver +@test_caller_feats_not_implied.ifunc = weak_odr alias i32 (), ptr @test_caller_feats_not_implied +@test_same_priority_callers.ifunc = weak_odr alias i32 (), ptr @test_same_priority_callers +@foo.ifunc = weak_odr alias i32 (), ptr @foo +@bar.ifunc = weak_odr alias i32 (), ptr @bar +@goo.ifunc = weak_odr alias i32 (), ptr @goo +@baz.ifunc = weak_odr alias i32 (), ptr @baz + +@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver +@test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver +@test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver +@test_same_priority_callers = weak_odr ifunc i32 (), ptr 
@test_same_priority_callers.resolver +@foo = weak_odr ifunc i32 (), ptr @foo.resolver +@bar = weak_odr ifunc i32 (), ptr @bar.resolver +@goo = weak_odr ifunc i32 (), ptr @goo.resolver +@baz = weak_odr ifunc i32 (), ptr @baz.resolver + +declare void @__init_cpu_features_resolver() local_unnamed_addr + +declare i32 @test_single_bb_resolver._Msve() #2 + +declare i32 @test_single_bb_resolver._Msve2() #3 + +define i32 @test_single_bb_resolver.default() #1 { +; CHECK-LABEL: define i32 @test_single_bb_resolver.default( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_single_bb_resolver.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve + %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + ret ptr %common.ret.op +} + +define i32 @foo._Msve() #2 { +; CHECK-LABEL: define i32 @foo._Msve( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + %add = add nsw i32 %call, 30 + ret i32 %add +} + +define i32 @foo._Msve2() #3 { +; CHECK-LABEL: define i32 @foo._Msve2( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK: [[CALL1:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; CHECK: [[CALL2:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; +entry: + %call1 = tail call i32 @test_single_bb_resolver() + %call2 = tail call i32 @test_single_bb_resolver() + 
%added = add nsw i32 %call1, %call2 + %add = add nsw i32 %added, 20 + ret i32 %add +} + +define i32 @foo.default() #1 { +; CHECK-LABEL: define i32 @foo.default( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + %add = add nsw i32 %call, 10 + ret i32 %add +} + +define weak_odr ptr @foo.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @foo.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %foo._Msve.foo.default = select i1 %.not3, ptr @foo.default, ptr @foo._Msve + %common.ret.op = select i1 %.not, ptr %foo._Msve.foo.default, ptr @foo._Msve2 + ret ptr %common.ret.op +} + +define i32 @test_multi_bb_resolver._Mmops() #4 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver._Mmops( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +entry: + ret i32 3 +} + +define i32 @test_multi_bb_resolver._Msve2() #3 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve2( +; CHECK-SAME: ) #[[ATTR1]] { +entry: + ret i32 2 +} + +define i32 @test_multi_bb_resolver._Msve() #2 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +entry: + ret i32 1 +} + +define i32 @test_multi_bb_resolver.default() #1 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver.default( +; CHECK-SAME: ) #[[ATTR2]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_multi_bb_resolver.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 
%.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 68719476736 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + br label %common.ret +} + +define i32 @bar._MmopsMsve2() #5 { +; CHECK-LABEL: define i32 @bar._MmopsMsve2( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 40 + ret i32 %add +} + +define i32 @bar._Mmops() #4 { +; CHECK-LABEL: define i32 @bar._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 30 + ret i32 %add +} + +define i32 @bar._Msve() #2 { +; CHECK-LABEL: define i32 @bar._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 20 + ret i32 %add +} + +define i32 @bar.default() #1 { +; CHECK-LABEL: define i32 @bar.default( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 10 + ret i32 %add +} + +define weak_odr 
ptr @bar.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @bar.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460821022900224 + %2 = icmp eq i64 %1, 576460821022900224 + %3 = and i64 %0, 1073741824 + %.not = icmp eq i64 %3, 0 + %bar._Msve.bar.default = select i1 %.not, ptr @bar.default, ptr @bar._Msve + %common.ret.op = select i1 %2, ptr @bar._MmopsMsve2, ptr %bar._Msve.bar.default + ret ptr %common.ret.op +} + +define i32 @test_caller_feats_not_implied._Mmops() #4 { +; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Mmops( +; CHECK-SAME: ) #[[ATTR4]] { +entry: + ret i32 3 +} + +define i32 @test_caller_feats_not_implied._Msme() #6 { +; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Msme( +; CHECK-SAME: ) #[[ATTR6:[0-9]+]] { +entry: + ret i32 2 +} + +define i32 @test_caller_feats_not_implied._Msve() #2 { +; CHECK-LABEL: define i32 @test_caller_feats_not_implied._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +entry: + ret i32 1 +} + +define i32 @test_caller_feats_not_implied.default() #1 { +; CHECK-LABEL: define i32 @test_caller_feats_not_implied.default( +; CHECK-SAME: ) #[[ATTR2]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_caller_feats_not_implied.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ 
%test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 4398046511104 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve + br label %common.ret +} + +define i32 @goo._Mmops() #4 { +; CHECK-LABEL: define i32 @goo._Mmops( +; CHECK-SAME: ) #[[ATTR4]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @goo._Msve() #2 { +; CHECK-LABEL: define i32 @goo._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @goo.default() #1 { +; CHECK-LABEL: define i32 @goo.default( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define weak_odr ptr @goo.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @goo.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %goo._Msve.goo.default = select i1 %.not3, ptr @goo.default, ptr @goo._Msve + %common.ret.op = select i1 %.not, ptr %goo._Msve.goo.default, ptr @goo._Mmops + ret ptr %common.ret.op +} + +define i32 
@test_same_priority_callers._Msve() #2 { +; CHECK-LABEL: define i32 @test_same_priority_callers._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +entry: + ret i32 1 +} + +define i32 @test_same_priority_callers.default() #1 { +; CHECK-LABEL: define i32 @test_same_priority_callers.default( +; CHECK-SAME: ) #[[ATTR2]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_same_priority_callers.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_same_priority_callers.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 1073741824 + %.not = icmp eq i64 %1, 0 + %test_same_priority_callers._Msve.test_same_priority_callers.default = select i1 %.not, ptr @test_same_priority_callers.default, ptr @test_same_priority_callers._Msve + ret ptr %test_same_priority_callers._Msve.test_same_priority_callers.default +} + +define dso_local i32 @baz._Msve() #2 { +; CHECK-LABEL: define dso_local i32 @baz._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +entry: + %call = tail call i32 @test_same_priority_callers() + ret i32 %call +} + +define i32 @baz._Maes() #1 { +; CHECK-LABEL: define i32 @baz._Maes( +; CHECK-SAME: ) #[[ATTR2]] { +entry: + %call = tail call i32 @test_same_priority_callers() + ret i32 %call +} + +define dso_local i32 @baz.default() #1 { +; CHECK-LABEL: define dso_local i32 @baz.default( +; CHECK-SAME: ) #[[ATTR2]] { +entry: + %call = tail call i32 @test_same_priority_callers() + ret i32 %call +} + +define weak_odr ptr @baz.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @baz.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 1073741824 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 16384 + %.not3 = icmp eq i64 %2, 0 + %baz._Maes.baz.default = select i1 %.not3, ptr @baz.default, ptr @baz._Maes + 
%common.ret.op = select i1 %.not, ptr %baz._Maes.baz.default, ptr @baz._Msve + ret ptr %common.ret.op +} + +attributes #0 = { "target-features"="+fmv" } +attributes #1 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" } +attributes #3 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" } +attributes #4 = { "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a" } +attributes #5 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outline-atomics,+sve,+sve2,+v8a" } +attributes #6 = { "target-features"="+bf16,+fp-armv8,+neon,+outline-atomics,+sme,+v8a" }