From 54ffdf4e047addb55eae1573d4bfb3f30a909ac9 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Sun, 7 Apr 2024 21:06:47 +0100 Subject: [PATCH] [FMV][GlobalOpt] Bypass the IFunc Resolver of MultiVersioned functions. To deduce whether the optimization is legal we need to compare the target features between caller and callee versions. The criteria for bypassing the resolver are the following: * If the callee's feature set is a subset of the caller's feature set, then the callee is a candidate for direct call. * Among such candidates the one of highest priority is the best match and it shall be picked, unless there is a version of the callee with higher priority than the best match which cannot be picked because there is no corresponding caller for whom it would have been the best match. Implementation details: First we collect all the callee versions in feature priority order. We do the same for all the callsites. Then we try to constant fold the resolver for every callsite starting from higher priority callers. This guarantees that as soon as we find a callee whose priority is lower than the expected best match there is no point in continuing further. The constant folding works for single basic block resolvers as well as for resolvers consisting of multiple basic blocks. We only attempt to fold a handful of instruction kinds (return, binop, compare, select, branch, phi) and we only follow single-user use-def chains. For callsites whose callers have the same feature priority we cache the folded result to avoid redundant computation. 
--- .../llvm/Analysis/TargetTransformInfo.h | 23 ++ .../llvm/Analysis/TargetTransformInfoImpl.h | 6 + llvm/lib/Analysis/TargetTransformInfo.cpp | 10 + .../AArch64/AArch64TargetTransformInfo.cpp | 12 + .../AArch64/AArch64TargetTransformInfo.h | 6 + llvm/lib/TargetParser/AArch64TargetParser.cpp | 9 +- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 227 +++++++++++++++++- .../Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 211 ++++++++++++++++ 8 files changed, 501 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index fa9392b86c15b9..530935fd63d326 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1762,6 +1762,16 @@ class TargetTransformInfo { /// false, but it shouldn't matter what it returns anyway. bool hasArmWideBranch(bool Thumb) const; + /// Returns true if the target supports Function MultiVersioning. + bool hasFMV() const; + + /// Returns the MultiVersion priority of a given function. + uint64_t getFMVPriority(Function &F) const; + + /// Returns the symbol which contains the cpu feature mask used by + /// the Function MultiVersioning resolver. + GlobalVariable *getCPUFeatures(Module &M) const; + /// \return The maximum number of function arguments the target supports. 
unsigned getMaxNumArgs() const; @@ -2152,6 +2162,9 @@ class TargetTransformInfo::Concept { virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; + virtual bool hasFMV() const = 0; + virtual uint64_t getFMVPriority(Function &F) const = 0; + virtual GlobalVariable *getCPUFeatures(Module &M) const = 0; virtual unsigned getMaxNumArgs() const = 0; }; @@ -2904,6 +2917,16 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasArmWideBranch(Thumb); } + bool hasFMV() const override { return Impl.hasFMV(); } + + uint64_t getFMVPriority(Function &F) const override { + return Impl.getFMVPriority(F); + } + + GlobalVariable *getCPUFeatures(Module &M) const override { + return Impl.getCPUFeatures(M); + } + unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 63c2ef8912b29c..746c09f0d50370 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -941,6 +941,12 @@ class TargetTransformInfoImplBase { bool hasArmWideBranch(bool) const { return false; } + bool hasFMV() const { return false; } + + uint64_t getFMVPriority(Function &F) const { return 0; } + + GlobalVariable *getCPUFeatures(Module &M) const { return nullptr; } + unsigned getMaxNumArgs() const { return UINT_MAX; } protected: diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 5f933b4587843c..39da6cc4445759 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1296,6 +1296,16 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const { return TTIImpl->hasArmWideBranch(Thumb); } +bool TargetTransformInfo::hasFMV() const { return TTIImpl->hasFMV(); } + +uint64_t 
TargetTransformInfo::getFMVPriority(Function &F) const { + return TTIImpl->getFMVPriority(F); +} + +GlobalVariable *TargetTransformInfo::getCPUFeatures(Module &M) const { + return TTIImpl->getCPUFeatures(M); +} + unsigned TargetTransformInfo::getMaxNumArgs() const { return TTIImpl->getMaxNumArgs(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index ee7137b92445bb..a92f859b59a3de 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include @@ -231,6 +232,17 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } +uint64_t AArch64TTIImpl::getFMVPriority(Function &F) const { + StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString(); + SmallVector Features; + FeatureStr.split(Features, ","); + return AArch64::getCpuSupportsMask(Features); +} + +GlobalVariable *AArch64TTIImpl::getCPUFeatures(Module &M) const { + return M.getGlobalVariable("__aarch64_cpu_features"); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index de39dea2be43e1..51ad79690679f5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -83,6 +83,12 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; + bool hasFMV() 
const { return ST->hasFMV(); } + + uint64_t getFMVPriority(Function &F) const; + + GlobalVariable *getCPUFeatures(Module &M) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 71099462d5ecff..7a3d2fc5f0c9db 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -50,8 +50,13 @@ std::optional AArch64::ArchInfo::findBySubArch(StringRef SubA uint64_t AArch64::getCpuSupportsMask(ArrayRef FeatureStrs) { uint64_t FeaturesMask = 0; for (const StringRef &FeatureStr : FeatureStrs) { - if (auto Ext = parseArchExtension(FeatureStr)) - FeaturesMask |= (1ULL << Ext->CPUFeature); + StringRef Feat = resolveExtAlias(FeatureStr); + for (const auto &E : Extensions) { + if (Feat == E.Name || Feat == E.Feature) { + FeaturesMask |= (1ULL << E.CPUFeature); + break; + } + } } return FeaturesMask; } diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index da714c9a75701b..f41905c4e77c41 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -89,7 +89,7 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated"); STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed"); STATISTIC(NumInternalFunc, "Number of internal functions"); STATISTIC(NumColdCC, "Number of functions marked coldcc"); -STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs"); +STATISTIC(NumIFuncsResolved, "Number of resolved IFuncs"); STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed"); static cl::opt @@ -2462,6 +2462,228 @@ DeleteDeadIFuncs(Module &M, return Changed; } +static Function *foldResolverForCallSite(CallBase *CS, uint64_t Priority, + TargetTransformInfo &TTI) { + // Look for the instruction which feeds the feature mask to the users. 
+ auto findRoot = [&TTI](Function *F) -> Instruction * { + for (Instruction &I : F->getEntryBlock()) + if (auto *Load = dyn_cast(&I)) + if (Load->getPointerOperand() == TTI.getCPUFeatures(*F->getParent())) + return Load; + return nullptr; + }; + + auto *IF = cast(CS->getCalledOperand()); + Instruction *Root = findRoot(IF->getResolverFunction()); + // There is no such instruction. Bail. + if (!Root) + return nullptr; + + // Create a constant mask to use as seed for the constant propagation. + Constant *Seed = Constant::getIntegerValue( + Root->getType(), APInt(Root->getType()->getIntegerBitWidth(), Priority)); + + auto DL = CS->getModule()->getDataLayout(); + + // Recursively propagate on single use chains. + std::function + constFoldInst = [&](Instruction *I, Instruction *Use, Constant *C, + BasicBlock *Pred) -> Constant * { + // Base case. + if (auto *Ret = dyn_cast(I)) + if (Ret->getReturnValue() == Use) + return C; + + // Minimal set of instruction types to handle. + if (auto *BinOp = dyn_cast(I)) { + bool Swap = BinOp->getOperand(1) == Use; + if (auto *Other = dyn_cast(BinOp->getOperand(Swap ? 0 : 1))) + C = Swap ? ConstantFoldBinaryInstruction(BinOp->getOpcode(), Other, C) + : ConstantFoldBinaryInstruction(BinOp->getOpcode(), C, Other); + } else if (auto *Cmp = dyn_cast(I)) { + bool Swap = Cmp->getOperand(1) == Use; + if (auto *Other = dyn_cast(Cmp->getOperand(Swap ? 0 : 1))) + C = Swap ? ConstantFoldCompareInstOperands(Cmp->getPredicate(), Other, + C, DL) + : ConstantFoldCompareInstOperands(Cmp->getPredicate(), C, + Other, DL); + } else if (auto *Sel = dyn_cast(I)) { + if (Sel->getCondition() == Use) + C = dyn_cast(C->isZeroValue() ? 
Sel->getFalseValue() + : Sel->getTrueValue()); + } else if (auto *Phi = dyn_cast(I)) { + if (Pred) + C = dyn_cast(Phi->getIncomingValueForBlock(Pred)); + } else if (auto *Br = dyn_cast(I)) { + if (Br->getCondition() == Use) { + BasicBlock *BB = Br->getSuccessor(C->isZeroValue()); + return constFoldInst(&BB->front(), Root, Seed, Br->getParent()); + } + } else { + // Don't know how to handle. Bail. + return nullptr; + } + + // Folding succeeded. Continue. + if (C && I->hasOneUse()) + if (auto *UI = dyn_cast(I->user_back())) + return constFoldInst(UI, I, C, nullptr); + + return nullptr; + }; + + // Collect all users in the entry block ordered by proximity. The rest of + // them can be discovered later. Unfortunately we cannot simply traverse + // the Root's 'users()' as their order is not the same as execution order. + unsigned NUsersLeft = std::distance(Root->user_begin(), Root->user_end()); + SmallVector Users; + for (Instruction &I : *Root->getParent()) { + if (any_of(I.operands(), [Root](auto &Op) { return Op == Root; })) { + Users.push_back(&I); + if (--NUsersLeft == 0) + break; + } + } + + // Return as soon as we find a foldable user. It has the highest priority. + for (Instruction *I : Users) { + Constant *C = constFoldInst(I, Root, Seed, nullptr); + if (C) + return cast(C); + } + + return nullptr; +} + +// Bypass the IFunc Resolver of MultiVersioned functions when possible. To +// deduce whether the optimization is legal we need to compare the target +// features between caller and callee versions. The criteria for bypassing +// the resolver are the following: +// +// * If the callee's feature set is a subset of the caller's feature set, +// then the callee is a candidate for direct call. 
+// +// * Among such candidates the one of highest priority is the best match +// and it shall be picked, unless there is a version of the callee with +// higher priority than the best match which cannot be picked because +// there is no corresponding caller for whom it would have been the best +// match. +// +static bool OptimizeNonTrivialIFuncs( + Module &M, function_ref GetTTI) { + bool Changed = false; + + std::function &)> visitValue = + [&](Value *V, SmallVectorImpl &FuncVersions) { + if (auto *Func = dyn_cast(V)) { + FuncVersions.push_back(Func); + } else if (auto *Sel = dyn_cast(V)) { + visitValue(Sel->getTrueValue(), FuncVersions); + visitValue(Sel->getFalseValue(), FuncVersions); + } else if (auto *Phi = dyn_cast(V)) + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + visitValue(Phi->getIncomingValue(I), FuncVersions); + }; + + // Cache containing the mask constructed from a function's target features. + DenseMap FeaturePriorityMap; + + for (GlobalIFunc &IF : M.ifuncs()) { + if (IF.isInterposable()) + continue; + + Function *Resolver = IF.getResolverFunction(); + if (!Resolver) + continue; + + if (Resolver->isInterposable()) + continue; + + TargetTransformInfo &TTI = GetTTI(*Resolver); + if (!TTI.hasFMV()) + continue; + + // Discover the callee versions. + SmallVector Callees; + for (BasicBlock &BB : *Resolver) + if (auto *Ret = dyn_cast_or_null(BB.getTerminator())) + visitValue(Ret->getReturnValue(), Callees); + + if (Callees.empty()) + continue; + + // Cache the feature mask for each callee. + for (Function *Callee : Callees) { + auto [It, Inserted] = FeaturePriorityMap.try_emplace(Callee); + if (Inserted) + It->second = TTI.getFMVPriority(*Callee); + } + + // Sort the callee versions in increasing feature priority order. + // Every time we find a caller that matches the highest priority + // callee we pop_back() one from this ordered list. 
+ llvm::stable_sort(Callees, [&](auto *LHS, auto *RHS) { + return FeaturePriorityMap[LHS] < FeaturePriorityMap[RHS]; + }); + + // Find the callsites and cache the feature mask for each caller. + SmallVector CallSites; + for (User *U : IF.users()) { + if (auto *CB = dyn_cast(U)) { + if (CB->getCalledOperand() == &IF) { + Function *Caller = CB->getFunction(); + auto [It, Inserted] = FeaturePriorityMap.try_emplace(Caller); + if (Inserted) + It->second = TTI.getFMVPriority(*Caller); + CallSites.push_back(CB); + } + } + } + + // Sort the callsites in decreasing feature priority order. + llvm::stable_sort(CallSites, [&](auto *LHS, auto *RHS) { + return FeaturePriorityMap[LHS->getFunction()] > + FeaturePriorityMap[RHS->getFunction()]; + }); + + // Now try to constant fold the resolver for every callsite starting + // from higher priority callers. This guarantees that as soon as we + // find a callee whose priority is lower than the expected best match + // then there is no point in continuing further. + DenseMap foldedResolverCache; + for (CallBase *CS : CallSites) { + uint64_t CallerPriority = FeaturePriorityMap[CS->getFunction()]; + auto [It, Inserted] = foldedResolverCache.try_emplace(CallerPriority); + Function *&Callee = It->second; + if (Inserted) + Callee = foldResolverForCallSite(CS, CallerPriority, TTI); + if (Callee) { + if (!Callees.empty()) { + // If the priority of the candidate is greater or equal to + // the expected best match then it shall be picked. Otherwise + // there is a higher priority callee without a corresponding + // caller, in which case abort. + uint64_t CalleePriority = FeaturePriorityMap[Callee]; + if (CalleePriority == FeaturePriorityMap[Callees.back()]) + Callees.pop_back(); + else if (CalleePriority < FeaturePriorityMap[Callees.back()]) + break; + } + CS->setCalledOperand(Callee); + Changed = true; + } else { + // Oops, something went wrong. We couldn't fold. Abort. 
+ break; + } + } + if (IF.use_empty() || + all_of(IF.users(), [](User *U) { return isa(U); })) + NumIFuncsResolved++; + } + return Changed; +} + static bool optimizeGlobalsInModule(Module &M, const DataLayout &DL, function_ref GetTLI, @@ -2525,6 +2747,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL, // Optimize IFuncs whose callee's are statically known. LocalChange |= OptimizeStaticIFuncs(M); + // Optimize IFuncs based on the target features of the caller. + LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI); + // Remove any IFuncs that are now dead. LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats); diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll new file mode 100644 index 00000000000000..bcc73c8e44970f --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -0,0 +1,211 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver)" --version 4 +; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +$test_single_bb_resolver.resolver = comdat any +$test_multi_bb_resolver.resolver = comdat any +$foo.resolver = comdat any +$bar.resolver = comdat any + +@__aarch64_cpu_features = external local_unnamed_addr global { i64 } + +@test_single_bb_resolver.ifunc = weak_odr alias i32 (), ptr @test_single_bb_resolver +@test_multi_bb_resolver.ifunc = weak_odr dso_local alias i32 (), ptr @test_multi_bb_resolver +@foo.ifunc = weak_odr alias i32 (), ptr @foo +@bar.ifunc = weak_odr dso_local alias i32 (), ptr @bar + +@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver +@test_multi_bb_resolver = weak_odr dso_local ifunc i32 (), ptr @test_multi_bb_resolver.resolver +@foo = weak_odr ifunc i32 (), ptr @foo.resolver +@bar = 
weak_odr dso_local ifunc i32 (), ptr @bar.resolver + +declare void @__init_cpu_features_resolver() local_unnamed_addr + +declare i32 @test_single_bb_resolver._Msve() #2 + +declare i32 @test_single_bb_resolver._Msve2() #3 + +define i32 @test_single_bb_resolver.default() #1 { +; CHECK-LABEL: define i32 @test_single_bb_resolver.default( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_single_bb_resolver.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve + %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + ret ptr %common.ret.op +} + +define i32 @foo._Msve() #2 { +; CHECK-LABEL: define i32 @foo._Msve( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + %add = add nsw i32 %call, 30 + ret i32 %add +} + +define i32 @foo._Msve2() #3 { +; CHECK-LABEL: define i32 @foo._Msve2( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK: [[CALL1:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; CHECK: [[CALL2:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; +entry: + %call1 = tail call i32 @test_single_bb_resolver() + %call2 = tail call i32 @test_single_bb_resolver() + %added = add nsw i32 %call1, %call2 + %add = add nsw i32 %added, 20 + ret i32 %add +} + +define i32 @foo.default() #1 { +; CHECK-LABEL: define i32 @foo.default( +; CHECK-SAME: ) 
#[[ATTR2:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + %add = add nsw i32 %call, 10 + ret i32 %add +} + +define weak_odr ptr @foo.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @foo.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %foo._Msve.foo.default = select i1 %.not3, ptr @foo.default, ptr @foo._Msve + %common.ret.op = select i1 %.not, ptr %foo._Msve.foo.default, ptr @foo._Msve2 + ret ptr %common.ret.op +} + +define i32 @test_multi_bb_resolver._Mmops() #4 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver._Mmops( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +entry: + ret i32 3 +} + +define i32 @test_multi_bb_resolver._Msve2() #3 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve2( +; CHECK-SAME: ) #[[ATTR1]] { +entry: + ret i32 2 +} + +define i32 @test_multi_bb_resolver._Msve() #2 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +entry: + ret i32 1 +} + +define i32 @test_multi_bb_resolver.default() #1 { +; CHECK-LABEL: define i32 @test_multi_bb_resolver.default( +; CHECK-SAME: ) #[[ATTR2]] { +entry: + ret i32 0 +} + +define weak_odr ptr @test_multi_bb_resolver.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, 
%resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 68719476736 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + br label %common.ret +} + +define i32 @bar._MmopsMsve2() #5 { +; CHECK-LABEL: define i32 @bar._MmopsMsve2( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 30 + ret i32 %add +} + +define i32 @bar._Msve() #2 { +; CHECK-LABEL: define i32 @bar._Msve( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 20 + ret i32 %add +} + +define i32 @bar.default() #1 { +; CHECK-LABEL: define i32 @bar.default( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + %add = add nsw i32 %call, 10 + ret i32 %add +} + +define weak_odr ptr @bar.resolver() #0 comdat { +; CHECK-LABEL: define weak_odr ptr @bar.resolver( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460821022900224 + %2 = icmp eq i64 %1, 576460821022900224 + %3 = and i64 %0, 1073741824 + %.not = icmp eq i64 %3, 0 + %bar._Msve.bar.default = select i1 %.not, ptr @bar.default, ptr @bar._Msve + %common.ret.op = select i1 %2, ptr 
@bar._MmopsMsve2, ptr %bar._Msve.bar.default + ret ptr %common.ret.op +} + +attributes #0 = { "target-features"="+fmv" } +attributes #1 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" } +attributes #3 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" } +attributes #4 = { "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a" } +attributes #5 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outline-atomics,+sve,+sve2,+v8a" } +