diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index fa9392b86c15b9..530935fd63d326 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1762,6 +1762,16 @@ class TargetTransformInfo {
   /// false, but it shouldn't matter what it returns anyway.
   bool hasArmWideBranch(bool Thumb) const;
 
+  /// Returns true if the target supports Function MultiVersioning.
+  bool hasFMV() const;
+
+  /// Returns the MultiVersion priority of a given function.
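+  /// The priority is a mask of feature bits derived from the function's
+  /// target features; versions are compared by numeric mask value, with
+  /// higher masks taking precedence.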
"llvm/Support/Debug.h" +#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include @@ -231,6 +232,17 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } +uint64_t AArch64TTIImpl::getFMVPriority(Function &F) const { + StringRef FeatureStr = F.getFnAttribute("target-features").getValueAsString(); + SmallVector Features; + FeatureStr.split(Features, ","); + return AArch64::getCpuSupportsMask(Features); +} + +GlobalVariable *AArch64TTIImpl::getCPUFeatures(Module &M) const { + return M.getGlobalVariable("__aarch64_cpu_features"); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index de39dea2be43e1..51ad79690679f5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -83,6 +83,12 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; + bool hasFMV() const { return ST->hasFMV(); } + + uint64_t getFMVPriority(Function &F) const; + + GlobalVariable *getCPUFeatures(Module &M) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 71099462d5ecff..7a3d2fc5f0c9db 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -50,8 +50,13 @@ std::optional AArch64::ArchInfo::findBySubArch(StringRef SubA uint64_t AArch64::getCpuSupportsMask(ArrayRef FeatureStrs) { uint64_t FeaturesMask = 0; for (const StringRef &FeatureStr : FeatureStrs) { - if (auto Ext = parseArchExtension(FeatureStr)) - FeaturesMask |= (1ULL << Ext->CPUFeature); + StringRef Feat = resolveExtAlias(FeatureStr); + for (const auto &E : Extensions) { + if (Feat == E.Name || Feat == E.Feature) { + FeaturesMask |= (1ULL << E.CPUFeature); + break; + } + } } return FeaturesMask; } diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index da714c9a75701b..f41905c4e77c41 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -89,7 +89,7 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated"); STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed"); STATISTIC(NumInternalFunc, "Number of internal functions"); STATISTIC(NumColdCC, "Number of functions marked coldcc"); -STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs"); +STATISTIC(NumIFuncsResolved, "Number of resolved IFuncs"); STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed"); static cl::opt @@ -2462,6 +2462,228 @@ DeleteDeadIFuncs(Module &M, return Changed; } +static Function *foldResolverForCallSite(CallBase *CS, uint64_t Priority, + TargetTransformInfo &TTI) { + // Look for the instruction which feeds the feature mask to the users. 
+  auto findRoot = [&TTI](Function *F) -> Instruction * {
+    for (Instruction &I : F->getEntryBlock())
+      if (auto *Load = dyn_cast<LoadInst>(&I))
+        if (Load->getPointerOperand() == TTI.getCPUFeatures(*F->getParent()))
+          return Load;
+    return nullptr;
+  };
+
+  auto *IF = cast<GlobalIFunc>(CS->getCalledOperand());
+  Instruction *Root = findRoot(IF->getResolverFunction());
+  // There is no such instruction. Bail.
+  if (!Root)
+    return nullptr;
+
+  // Create a constant mask to use as seed for the constant propagation.
+  Constant *Seed = Constant::getIntegerValue(
+      Root->getType(), APInt(Root->getType()->getIntegerBitWidth(), Priority));
+
+  const DataLayout &DL = CS->getModule()->getDataLayout();
+
+  // Recursively propagate the constant along single-use chains.
+  std::function<Constant *(Instruction *, Instruction *, Constant *,
+                           BasicBlock *)>
+      constFoldInst = [&](Instruction *I, Instruction *Use, Constant *C,
+                          BasicBlock *Pred) -> Constant * {
+    // Base case.
+    if (auto *Ret = dyn_cast<ReturnInst>(I))
+      if (Ret->getReturnValue() == Use)
+        return C;
+
+    // Minimal set of instruction types to handle.
+    if (auto *BinOp = dyn_cast<BinaryOperator>(I)) {
+      bool Swap = BinOp->getOperand(1) == Use;
+      if (auto *Other = dyn_cast<Constant>(BinOp->getOperand(Swap ? 0 : 1)))
+        C = Swap ? ConstantFoldBinaryInstruction(BinOp->getOpcode(), Other, C)
+                 : ConstantFoldBinaryInstruction(BinOp->getOpcode(), C, Other);
+    } else if (auto *Cmp = dyn_cast<CmpInst>(I)) {
+      bool Swap = Cmp->getOperand(1) == Use;
+      if (auto *Other = dyn_cast<Constant>(Cmp->getOperand(Swap ? 0 : 1)))
+        C = Swap ? ConstantFoldCompareInstOperands(Cmp->getPredicate(), Other,
+                                                   C, DL)
+                 : ConstantFoldCompareInstOperands(Cmp->getPredicate(), C,
+                                                   Other, DL);
+    } else if (auto *Sel = dyn_cast<SelectInst>(I)) {
+      if (Sel->getCondition() == Use)
+        C = dyn_cast<Constant>(C->isZeroValue() ? Sel->getFalseValue()
+                                                : Sel->getTrueValue());
+    } else if (auto *Phi = dyn_cast<PHINode>(I)) {
+      if (Pred)
+        C = dyn_cast<Constant>(Phi->getIncomingValueForBlock(Pred));
+    } else if (auto *Br = dyn_cast<BranchInst>(I)) {
+      if (Br->getCondition() == Use) {
+        BasicBlock *BB = Br->getSuccessor(C->isZeroValue());
+        return constFoldInst(&BB->front(), Root, Seed, Br->getParent());
+      }
+    } else {
+      // Don't know how to handle. Bail.
+      return nullptr;
+    }
+
+    // Folding succeeded. Continue.
+    if (C && I->hasOneUse())
+      if (auto *UI = dyn_cast<Instruction>(I->user_back()))
+        return constFoldInst(UI, I, C, nullptr);
+
+    return nullptr;
+  };
+
+  // Collect all users in the entry block ordered by proximity. The rest of
+  // them can be discovered later. Unfortunately we cannot simply traverse
+  // Root's users(), as their order is not the same as the execution order.
+  unsigned NUsersLeft = std::distance(Root->user_begin(), Root->user_end());
+  SmallVector<Instruction *> Users;
+  for (Instruction &I : *Root->getParent()) {
+    if (any_of(I.operands(), [Root](auto &Op) { return Op == Root; })) {
+      Users.push_back(&I);
+      if (--NUsersLeft == 0)
+        break;
+    }
+  }
+
+  // Return as soon as we find a foldable user. It has the highest priority.
+  for (Instruction *I : Users) {
+    Constant *C = constFoldInst(I, Root, Seed, nullptr);
+    if (C)
+      return cast<Function>(C);
+  }
+
+  return nullptr;
+}
+
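+// Illustrative example for the fold above: assuming a caller priority mask
+// with only the sve bit set, a single-select resolver of the form
+//
+//   %sel = select i1 %has_sve, ptr @fn._Msve, ptr @fn.default
+//   ret ptr %sel
+//
+// (hypothetical names) folds to @fn._Msve: the seed propagates through the
+// feature test into the select condition and picks the true value.
+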
+// Bypass the IFunc Resolver of MultiVersioned functions when possible. To
+// deduce whether the optimization is legal we need to compare the target
+// features between caller and callee versions. The criteria for bypassing
+// the resolver are the following:
+//
+// * If the callee's feature set is a subset of the caller's feature set,
+//   then the callee is a candidate for direct call.
+//
+// * Among such candidates the one of highest priority is the best match
+//   and it shall be picked, unless there is a version of the callee with
+//   higher priority than the best match, which cannot be picked because
+//   there is no corresponding caller for whom it would have been the best
+//   match.
+//
+static bool OptimizeNonTrivialIFuncs(
+    Module &M, function_ref<TargetTransformInfo &(Function &)> GetTTI) {
+  bool Changed = false;
+
+  std::function<void(Value *, SmallVectorImpl<Function *> &)> visitValue =
+      [&](Value *V, SmallVectorImpl<Function *> &FuncVersions) {
+        if (auto *Func = dyn_cast<Function>(V)) {
+          FuncVersions.push_back(Func);
+        } else if (auto *Sel = dyn_cast<SelectInst>(V)) {
+          visitValue(Sel->getTrueValue(), FuncVersions);
+          visitValue(Sel->getFalseValue(), FuncVersions);
+        } else if (auto *Phi = dyn_cast<PHINode>(V)) {
+          for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+            visitValue(Phi->getIncomingValue(I), FuncVersions);
+        }
+      };
+
+  // Cache containing the priority mask constructed from a function's
+  // target features.
+  DenseMap<Function *, uint64_t> FeaturePriorityMap;
+
+  for (GlobalIFunc &IF : M.ifuncs()) {
+    if (IF.isInterposable())
+      continue;
+
+    Function *Resolver = IF.getResolverFunction();
+    if (!Resolver)
+      continue;
+
+    if (Resolver->isInterposable())
+      continue;
+
+    TargetTransformInfo &TTI = GetTTI(*Resolver);
+    if (!TTI.hasFMV())
+      return false;
+
+    // Discover the callee versions.
+    SmallVector<Function *> Callees;
+    for (BasicBlock &BB : *Resolver)
+      if (auto *Ret = dyn_cast_or_null<ReturnInst>(BB.getTerminator()))
+        visitValue(Ret->getReturnValue(), Callees);
+
+    if (Callees.empty())
+      continue;
+
+    // Cache the feature mask for each callee.
+    for (Function *Callee : Callees) {
+      auto [It, Inserted] = FeaturePriorityMap.try_emplace(Callee);
+      if (Inserted)
+        It->second = TTI.getFMVPriority(*Callee);
+    }
+
+    // Sort the callee versions in increasing feature priority order.
+    // Every time we find a caller that matches the highest priority
+    // callee we pop_back() one from this ordered list.
+    llvm::stable_sort(Callees, [&](auto *LHS, auto *RHS) {
+      return FeaturePriorityMap[LHS] < FeaturePriorityMap[RHS];
+    });
+
+    // Find the callsites and cache the feature mask for each caller.
+    SmallVector<CallBase *> CallSites;
+    for (User *U : IF.users()) {
+      if (auto *CB = dyn_cast<CallBase>(U)) {
+        if (CB->getCalledOperand() == &IF) {
+          Function *Caller = CB->getFunction();
+          auto [It, Inserted] = FeaturePriorityMap.try_emplace(Caller);
+          if (Inserted)
+            It->second = TTI.getFMVPriority(*Caller);
+          CallSites.push_back(CB);
+        }
+      }
+    }
+
+    // Sort the callsites in decreasing feature priority order.
+    llvm::stable_sort(CallSites, [&](auto *LHS, auto *RHS) {
+      return FeaturePriorityMap[LHS->getFunction()] >
+             FeaturePriorityMap[RHS->getFunction()];
+    });
+
+    // Now try to constant fold the resolver for every callsite, starting
+    // from the higher priority callers. This guarantees that as soon as we
+    // find a callee whose priority is lower than the expected best match,
+    // there is no point in continuing further.
+    DenseMap<uint64_t, Function *> FoldedResolverCache;
+    for (CallBase *CS : CallSites) {
+      uint64_t CallerPriority = FeaturePriorityMap[CS->getFunction()];
+      auto [It, Inserted] = FoldedResolverCache.try_emplace(CallerPriority);
+      Function *&Callee = It->second;
+      if (Inserted)
+        Callee = foldResolverForCallSite(CS, CallerPriority, TTI);
+      if (Callee) {
+        if (!Callees.empty()) {
+          // If the priority of the candidate is greater than or equal to
+          // the expected best match then it shall be picked. Otherwise
+          // there is a higher priority callee without a corresponding
+          // caller, in which case abort.
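+          // Example: with callee priorities sorted as [default, sve, sve2],
+          // if the highest priority caller folds to the sve version, then
+          // sve2 has no caller for which it would have been the best match,
+          // so we cannot bypass the resolver for any remaining callsite.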
+          uint64_t CalleePriority = FeaturePriorityMap[Callee];
+          if (CalleePriority == FeaturePriorityMap[Callees.back()])
+            Callees.pop_back();
+          else if (CalleePriority < FeaturePriorityMap[Callees.back()])
+            break;
+        }
+        CS->setCalledOperand(Callee);
+        Changed = true;
+      } else {
+        // We could not fold the resolver for this callsite. Abort.
+        break;
+      }
+    }
+    if (IF.use_empty() ||
+        all_of(IF.users(), [](User *U) { return isa<GlobalAlias>(U); }))
+      NumIFuncsResolved++;
+  }
+  return Changed;
+}
+
 static bool
 optimizeGlobalsInModule(Module &M, const DataLayout &DL,
                         function_ref<TargetLibraryInfo &(Function &)> GetTLI,
@@ -2525,6 +2747,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL,
   // Optimize IFuncs whose callees are statically known.
   LocalChange |= OptimizeStaticIFuncs(M);
 
+  // Optimize IFuncs based on the target features of the caller.
+  LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI);
+
   // Remove any IFuncs that are now dead.
   LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats);
 
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
new file mode 100644
index 00000000000000..bcc73c8e44970f
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll
@@ -0,0 +1,211 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver)" --version 4
+; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+$test_single_bb_resolver.resolver = comdat any
+$test_multi_bb_resolver.resolver = comdat any
+$foo.resolver = comdat any
+$bar.resolver = comdat any
+
+@__aarch64_cpu_features = external local_unnamed_addr global { i64 }
+
+@test_single_bb_resolver.ifunc = weak_odr alias i32 (), ptr @test_single_bb_resolver
+@test_multi_bb_resolver.ifunc = weak_odr dso_local alias i32 (), ptr @test_multi_bb_resolver
+@foo.ifunc = weak_odr alias i32 (), ptr @foo
+@bar.ifunc = weak_odr dso_local alias i32 (), ptr @bar
+
+@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver
+@test_multi_bb_resolver = weak_odr dso_local ifunc i32 (), ptr @test_multi_bb_resolver.resolver
+@foo = weak_odr ifunc i32 (), ptr @foo.resolver
+@bar = weak_odr dso_local ifunc i32 (), ptr @bar.resolver
+
+declare void @__init_cpu_features_resolver() local_unnamed_addr
+
+declare i32 @test_single_bb_resolver._Msve() #2
+
+declare i32 @test_single_bb_resolver._Msve2() #3
+
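+; The resolver masks in this file are FMV feature bits as materialized in
+; @__aarch64_cpu_features by this test: 1073741824 (1 << 30) gates the sve
+; versions, 68719476736 (1 << 36) gates sve2, and 576460752303423488
+; (1 << 59) gates mops.
+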
+define i32 @test_single_bb_resolver.default() #1 {
+; CHECK-LABEL: define i32 @test_single_bb_resolver.default(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+entry:
+  ret i32 0
+}
+
+define weak_odr ptr @test_single_bb_resolver.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 68719476736
+  %.not = icmp eq i64 %1, 0
+  %2 = and i64 %0, 1073741824
+  %.not3 = icmp eq i64 %2, 0
+  %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve
+  %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2
+  ret ptr %common.ret.op
+}
+
+define i32 @foo._Msve() #2 {
+; CHECK-LABEL: define i32 @foo._Msve(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve()
+;
+entry:
+  %call = tail call i32 @test_single_bb_resolver()
+  %add = add nsw i32 %call, 30
+  ret i32 %add
+}
+
+define i32 @foo._Msve2() #3 {
+; CHECK-LABEL: define i32 @foo._Msve2(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK:    [[CALL1:%.*]] = tail call i32 @test_single_bb_resolver._Msve2()
+; CHECK:    [[CALL2:%.*]] = tail call i32 @test_single_bb_resolver._Msve2()
+;
+entry:
+  %call1 = tail call i32 @test_single_bb_resolver()
+  %call2 = tail call i32 @test_single_bb_resolver()
+  %added = add nsw i32 %call1, %call2
+  %add = add nsw i32 %added, 20
+  ret i32 %add
+}
+
+define i32 @foo.default() #1 {
+; CHECK-LABEL: define i32 @foo.default(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default()
+;
+entry:
+  %call = tail call i32 @test_single_bb_resolver()
+  %add = add nsw i32 %call, 10
+  ret i32 %add
+}
+
+define weak_odr ptr @foo.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @foo.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 68719476736
+  %.not = icmp eq i64 %1, 0
+  %2 = and i64 %0, 1073741824
+  %.not3 = icmp eq i64 %2, 0
+  %foo._Msve.foo.default = select i1 %.not3, ptr @foo.default, ptr @foo._Msve
+  %common.ret.op = select i1 %.not, ptr %foo._Msve.foo.default, ptr @foo._Msve2
+  ret ptr %common.ret.op
+}
+
+define i32 @test_multi_bb_resolver._Mmops() #4 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver._Mmops(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+entry:
+  ret i32 3
+}
+
+define i32 @test_multi_bb_resolver._Msve2() #3 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve2(
+; CHECK-SAME: ) #[[ATTR1]] {
+entry:
+  ret i32 2
+}
+
+define i32 @test_multi_bb_resolver._Msve() #2 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+entry:
+  ret i32 1
+}
+
+define i32 @test_multi_bb_resolver.default() #1 {
+; CHECK-LABEL: define i32 @test_multi_bb_resolver.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+entry:
+  ret i32 0
+}
+
+define weak_odr ptr @test_multi_bb_resolver.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 576460752303423488
+  %.not = icmp eq i64 %1, 0
+  br i1 %.not, label %resolver_else, label %common.ret
+
+common.ret:                                       ; preds = %resolver_else2, %resolver_else, %resolver_entry
+  %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ]
+  ret ptr %common.ret.op
+
+resolver_else:                                    ; preds = %resolver_entry
+  %2 = and i64 %0, 68719476736
+  %.not5 = icmp eq i64 %2, 0
+  br i1 %.not5, label %resolver_else2, label %common.ret
+
+resolver_else2:                                   ; preds = %resolver_else
+  %3 = and i64 %0, 1073741824
+  %.not6 = icmp eq i64 %3, 0
+  %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve
+  br label %common.ret
+}
+
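+; bar._MmopsMsve2 provides both mops and sve2: its priority mask includes
+; 576460821022900224 = (1 << 59) | (1 << 36), so the mops test in
+; @test_multi_bb_resolver.resolver folds to true and the call below is
+; bypassed straight to @test_multi_bb_resolver._Mmops.
+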
+define i32 @bar._MmopsMsve2() #5 {
+; CHECK-LABEL: define i32 @bar._MmopsMsve2(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops()
+;
+entry:
+  %call = tail call i32 @test_multi_bb_resolver()
+  %add = add nsw i32 %call, 30
+  ret i32 %add
+}
+
+define i32 @bar._Msve() #2 {
+; CHECK-LABEL: define i32 @bar._Msve(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver()
+;
+entry:
+  %call = tail call i32 @test_multi_bb_resolver()
+  %add = add nsw i32 %call, 20
+  ret i32 %add
+}
+
+define i32 @bar.default() #1 {
+; CHECK-LABEL: define i32 @bar.default(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK:    [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver()
+;
+entry:
+  %call = tail call i32 @test_multi_bb_resolver()
+  %add = add nsw i32 %call, 10
+  ret i32 %add
+}
+
+define weak_odr ptr @bar.resolver() #0 comdat {
+; CHECK-LABEL: define weak_odr ptr @bar.resolver(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] comdat {
+resolver_entry:
+  tail call void @__init_cpu_features_resolver()
+  %0 = load i64, ptr @__aarch64_cpu_features, align 8
+  %1 = and i64 %0, 576460821022900224
+  %2 = icmp eq i64 %1, 576460821022900224
+  %3 = and i64 %0, 1073741824
+  %.not = icmp eq i64 %3, 0
+  %bar._Msve.bar.default = select i1 %.not, ptr @bar.default, ptr @bar._Msve
+  %common.ret.op = select i1 %2, ptr @bar._MmopsMsve2, ptr %bar._Msve.bar.default
+  ret ptr %common.ret.op
+}
+
+attributes #0 = { "target-features"="+fmv" }
+attributes #1 = { "target-features"="+fmv,+fp-armv8,+neon,+outline-atomics,+v8a" }
+attributes #2 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" }
+attributes #3 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" }
+attributes #4 = { "target-features"="+fmv,+fp-armv8,+mops,+neon,+outline-atomics,+v8a" }
+attributes #5 = { "target-features"="+fmv,+fp-armv8,+fullfp16,+mops,+neon,+outline-atomics,+sve,+sve2,+v8a" }