Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[llvm][GlobalOpt] Optimize statically resolvable IFuncs #80606

Merged
merged 10 commits into from
Feb 6, 2024

Conversation

jroelofs
Copy link
Contributor

@jroelofs jroelofs commented Feb 4, 2024

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Feb 4, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Jon Roelofs (jroelofs)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/80606.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/IPO/GlobalOpt.cpp (+40)
  • (added) llvm/test/Transforms/GlobalOpt/resolve-static-ifunc.ll (+81)
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 951372adcfa93..8fce21a9f9dd9 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -89,6 +89,7 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
 STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
 STATISTIC(NumInternalFunc, "Number of internal functions");
 STATISTIC(NumColdCC, "Number of functions marked coldcc");
+STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
 
 static cl::opt<bool>
     EnableColdCCStressTest("enable-coldcc-stress-test",
@@ -2404,6 +2405,42 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
   return Changed;
 }
 
+static Function *hasSideeffectFreeStaticResolution(GlobalIFunc &IF) {
+  Function *Resolver = IF.getResolverFunction();
+  if (!Resolver)
+    return nullptr;
+
+  Function *Callee = nullptr;
+  for (BasicBlock &BB : *Resolver) {
+    if (any_of(BB, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+      return nullptr;
+
+    if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
+      if (auto *F = dyn_cast<Function>(Ret->getReturnValue())) {
+        if (!Callee)
+          Callee = F;
+        if (F != Callee)
+          return nullptr;
+      }
+  }
+
+  return Callee;
+}
+
+/// Find IFuncs that have resolvers that always point at the same statically
+/// known callee, and replace their callers with a direct call.
+static bool OptimizeStaticIFuncs(Module &M) {
+  bool Changed = false;
+  for (GlobalIFunc &IF : M.ifuncs())
+    if (Function *Callee = hasSideeffectFreeStaticResolution(IF))
+      if (!IF.use_empty()) {
+        IF.replaceAllUsesWith(Callee);
+        NumIFuncsResolved++;
+        Changed = true;
+      }
+  return Changed;
+}
+
 static bool
 optimizeGlobalsInModule(Module &M, const DataLayout &DL,
                         function_ref<TargetLibraryInfo &(Function &)> GetTLI,
@@ -2464,6 +2501,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL,
     if (CXAAtExitFn)
       LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
 
+    // Optimize IFuncs whose callee's are statically known.
+    LocalChange |= OptimizeStaticIFuncs(M);
+
     Changed |= LocalChange;
   }
 
diff --git a/llvm/test/Transforms/GlobalOpt/resolve-static-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-static-ifunc.ll
new file mode 100644
index 0000000000000..d51e9111ad2cd
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/resolve-static-ifunc.ll
@@ -0,0 +1,81 @@
+; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+$callee_with_trivial_resolver.resolver = comdat any
+@callee_with_trivial_resolver.ifunc = weak_odr dso_local ifunc void (), ptr @callee_with_trivial_resolver.resolver
+define weak_odr ptr @callee_with_trivial_resolver.resolver() comdat {
+  ret ptr @callee_with_trivial_resolver._Msimd
+}
+define void @callee_with_trivial_resolver._Msimd() {
+  ret void
+}
+define void @callee_with_trivial_resolver.default() {
+  ret void
+}
+
+@unknown_condition = external global i1
+$callee_with_complex_static_resolver.resolver = comdat any
+@callee_with_complex_static_resolver.ifunc = weak_odr dso_local ifunc void (), ptr @callee_with_complex_static_resolver.resolver
+define weak_odr ptr @callee_with_complex_static_resolver.resolver() comdat {
+entry:
+  %v = load i1, ptr @unknown_condition
+  br i1 %v, label %fast, label %slow
+fast:
+  ret ptr @callee_with_complex_static_resolver._Msimd
+slow:
+  ret ptr @callee_with_complex_static_resolver._Msimd
+}
+define void @callee_with_complex_static_resolver._Msimd() {
+  ret void
+}
+define void @callee_with_complex_static_resolver.default() {
+  ret void
+}
+
+$callee_with_complex_dynamic_resolver.resolver = comdat any
+@callee_with_complex_dynamic_resolver.ifunc = weak_odr dso_local ifunc void (), ptr @callee_with_complex_dynamic_resolver.resolver
+define weak_odr ptr @callee_with_complex_dynamic_resolver.resolver() comdat {
+entry:
+  %v = load i1, ptr @unknown_condition
+  br i1 %v, label %fast, label %slow
+fast:
+  ret ptr @callee_with_complex_dynamic_resolver._Msimd
+slow:
+  ret ptr @callee_with_complex_dynamic_resolver.default
+}
+define void @callee_with_complex_dynamic_resolver._Msimd() {
+  ret void
+}
+define void @callee_with_complex_dynamic_resolver.default() {
+  ret void
+}
+
+$callee_with_sideeffects_resolver.resolver = comdat any
+@callee_with_sideeffects_resolver.ifunc = weak_odr dso_local ifunc void (), ptr @callee_with_sideeffects_resolver.resolver
+define weak_odr ptr @callee_with_sideeffects_resolver.resolver() comdat {
+  store i1 0, ptr @unknown_condition
+  ret ptr @callee_with_sideeffects_resolver.default
+}
+define void @callee_with_sideeffects_resolver._Msimd() {
+  ret void
+}
+define void @callee_with_sideeffects_resolver.default() {
+  ret void
+}
+
+define void @caller() {
+  call void @callee_with_trivial_resolver.ifunc()
+  call void @callee_with_complex_static_resolver.ifunc()
+  call void @callee_with_complex_dynamic_resolver.ifunc()
+  call void @callee_with_sideeffects_resolver.ifunc()
+  ret void
+}
+
+; CHECK-LABEL: define void @caller()
+; CHECK-NEXT:     call void @callee_with_trivial_resolver._Msimd()
+; CHECK-NEXT:     call void @callee_with_complex_static_resolver._Msimd()
+; CHECK-NEXT:     call void @callee_with_complex_dynamic_resolver.ifunc()
+; CHECK-NEXT:     call void @callee_with_sideeffects_resolver.ifunc()
+; CHECK-NEXT:     ret void

@jroelofs
Copy link
Contributor Author

jroelofs commented Feb 4, 2024

This is a baby step towards #71714

@jroelofs
Copy link
Contributor Author

jroelofs commented Feb 4, 2024

And here's an end-to-end example where it would apply: https://clang.godbolt.org/z/PYx6jT4W5

@nikic nikic self-requested a review February 4, 2024 17:16
@jroelofs jroelofs force-pushed the jroelofs/optimize-ifunc branch from f85979e to 5318a4d Compare February 4, 2024 18:22
return nullptr;

Function *Callee = nullptr;
for (BasicBlock &BB : *Resolver) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't account for divergence effects (infinite loops). I'd recommend to only handle the case with a single basic block and rely on preceding simplification to reduce it to that form. (Unless there is a phase-ordering reason to believe this will not happen?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great point. I have a follow-up in mind I should ask your advice on then:

I want to extend this (in subsequent patch(es)) to de-virtualize call sites when the result is not quite as obvious. Looking at attributes on the caller will give us some known bits on __aarch64_cpu_features.features, and if that leads to the resolver being constant-foldable in the context of that caller, we can make a direct call.

Would you clone the resolver for each call-site, RAUW the loads from __aarch64_cpu_features.features with __aarch64_cpu_features.features | known_bits and then InstCombine+SimplifyCFG, followed by reverting the call site back to the original resolver if it didn't work out & DCE-ing the remaining cruft? Or would you attempt to do a more analysis/reasoning-based approach?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd go for a symbolic evaluation approach. Basically go through the instructions, try to constant fold (or instsimplify) them, plus handle the specific feature pattern you are looking for. GlobalOpt does something similar for evaluating global_ctors (https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Utils/Evaluator.cpp), though I expect that what you need here would be much simpler. (The hard part of evaluating global_ctors is mutating global state, which presumably isn't relevant here.)

Doing this kind of speculative optimization + rollback is something we try to avoid, as it is expensive. I don't think we use function cloning for this purpose anywhere. (IPSCCP does clone functions, but not as an analysis mechanism.)

Copy link
Collaborator

@labrinea labrinea Apr 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have tried a few things to achieve what @jroelofs is describing but none of them seems ideal:

  • extend IPSCCP to create specializations of the resolver
  • constant fold the resolver per callsite in InstCombineCalls.cpp

The second attempt looks better but I am not sure Instcombine allows such expensive transformations. Another concern is that when constant folding I am handling a few types of instructions which I know the FMV resolver on AArch64 will have (binop, compare, select, branch, phi, return). That's quite specific - not generic enough. Also having to use the AArch64 Target Parser to create a constant mask, or looking for loads from address __aarch64_cpu_features doesn't fit well in a target independent optimization. Thoughts?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also having to use the AArch64 Target Parser to create a constant mask, or looking for loads from address __aarch64_cpu_features doesn't fit well in a target independent optimization.

These feel like they could be served by TTI hooks.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI #87939

llvm/lib/Transforms/IPO/GlobalOpt.cpp Show resolved Hide resolved
llvm/lib/Transforms/IPO/GlobalOpt.cpp Outdated Show resolved Hide resolved
llvm/lib/Transforms/IPO/GlobalOpt.cpp Show resolved Hide resolved
llvm/test/Transforms/GlobalOpt/resolve-static-ifunc.ll Outdated Show resolved Hide resolved
llvm/test/Transforms/GlobalOpt/resolve-static-ifunc.ll Outdated Show resolved Hide resolved
Copy link

github-actions bot commented Feb 5, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

Copy link
Contributor

@nikic nikic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@jroelofs jroelofs merged commit e976385 into llvm:main Feb 6, 2024
3 of 4 checks passed
@jroelofs jroelofs deleted the jroelofs/optimize-ifunc branch February 6, 2024 21:59
jroelofs added a commit to swiftlang/llvm-project that referenced this pull request Feb 12, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants