[TailCallElim] Preserve DT and PDT

Summary: Previously, in the NewPM pipeline, TailCallElim recalculated the DomTree whenever it modified any instruction in the Function. For example, ``` CallInst *CI = dyn_cast<CallInst>(&I); ... CI->setTailCall(); Modified = true; ... if (!Modified || ...) return PreservedAnalyses::all(); ``` After applying this patch, the DomTree is only recalculated when needed (at the cost of an extra insertEdge() call and an extra deleteEdge() call). When optimizing SQLite with the `-passes="default<O3>"` pipeline of the NewPM, the number of DomTree recalculations decreases by 6.2% and the number of nodes visited by DFS decreases by 2.9%. The time spent in DomTree decreases by approximately 1%~2.5% after applying the patch. Statistics: ``` Before the patch: 23010 dom-tree-stats - Number of DomTree recalculations 489264 dom-tree-stats - Number of nodes visited by DFS -- DomTree After the patch: 21581 dom-tree-stats - Number of DomTree recalculations 475088 dom-tree-stats - Number of nodes visited by DFS -- DomTree ``` Reviewers: kuhar, dmgreen, brzycki, grosser, davide Reviewed By: kuhar, brzycki Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D49982 llvm-svn: 338954
2025-01-31 12:41:49 +01:00 · 2018-08-04 08:13:47 +00:00 · 2018-08-04 08:13:47 +00:00 · 28625d59d9
commit 28625d59d9
parent 4c80642b2c
16 changed files with 71 additions and 42 deletions
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@ -229,7 +229,8 @@ void SplitLandingPadPredecessors(BasicBlock *OrigBB,
 /// value defined by a PHI, propagate the right value into the return. It
 /// returns the new return instruction in the predecessor.
 ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
-                                       BasicBlock *Pred);
+                                       BasicBlock *Pred,
+                                       DomTreeUpdater *DTU = nullptr);

 /// Split the containing block at the specified instruction - everything before
 /// SplitBefore stays in the old basic block, and the rest of the instructions
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@ -61,6 +61,7 @@
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallSite.h"
@ -68,6 +69,8 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DomTreeUpdater.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
@ -488,12 +491,10 @@ static CallInst *findTRECandidate(Instruction *TI,
  return CI;
 }

-static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
-                                       BasicBlock *&OldEntry,
-                                       bool &TailCallsAreMarkedTail,
-                                       SmallVectorImpl<PHINode *> &ArgumentPHIs,
-                                       AliasAnalysis *AA,
-                                       OptimizationRemarkEmitter *ORE) {
+static bool eliminateRecursiveTailCall(
+    CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry,
+    bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs,
+    AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
  // If we are introducing accumulator recursion to eliminate operations after
  // the call instruction that are both associative and commutative, the initial
  // value for the accumulator is placed in this variable.  If this value is set
@ -593,6 +594,10 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
      PN->addIncoming(&*I, NewEntry);
      ArgumentPHIs.push_back(PN);
    }
+    // The entry block was changed from OldEntry to NewEntry.
+    // The forward DominatorTree needs to be recalculated when the EntryBB is
+    // changed. In this corner-case we recalculate the entire tree.
+    DTU.recalculate(*NewEntry->getParent());
  }

  // If this function has self recursive calls in the tail position where some
@ -668,6 +673,7 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,

  BB->getInstList().erase(Ret);  // Remove return.
  BB->getInstList().erase(CI);   // Remove call.
+  DTU.insertEdge(BB, OldEntry);
  ++NumEliminated;
  return true;
 }
@ -676,7 +682,7 @@ static bool foldReturnAndProcessPred(
    BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry,
    bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs,
    bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
-    AliasAnalysis *AA, OptimizationRemarkEmitter *ORE) {
+    AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
  bool Change = false;

  // Make sure this block is a trivial return block.
@ -702,17 +708,17 @@ static bool foldReturnAndProcessPred(
    if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){
      LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
                        << "INTO UNCOND BRANCH PRED: " << *Pred);
-      ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+      ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred, &DTU);

      // Cleanup: if all predecessors of BB have been eliminated by
      // FoldReturnIntoUncondBranch, delete it.  It is important to empty it,
      // because the ret instruction in there is still using a value which
      // eliminateRecursiveTailCall will attempt to remove.
      if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
-        BB->eraseFromParent();
+        DTU.deleteBB(BB);

      eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
-                                 ArgumentPHIs, AA, ORE);
+                                 ArgumentPHIs, AA, ORE, DTU);
      ++NumRetDuped;
      Change = true;
    }
@ -721,24 +727,23 @@ static bool foldReturnAndProcessPred(
  return Change;
 }

-static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
-                                  bool &TailCallsAreMarkedTail,
-                                  SmallVectorImpl<PHINode *> &ArgumentPHIs,
-                                  bool CannotTailCallElimCallsMarkedTail,
-                                  const TargetTransformInfo *TTI,
-                                  AliasAnalysis *AA,
-                                  OptimizationRemarkEmitter *ORE) {
+static bool processReturningBlock(
+    ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail,
+    SmallVectorImpl<PHINode *> &ArgumentPHIs,
+    bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
+    AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
  CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
  if (!CI)
    return false;

  return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
-                                    ArgumentPHIs, AA, ORE);
+                                    ArgumentPHIs, AA, ORE, DTU);
 }

 static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
                                   AliasAnalysis *AA,
-                                   OptimizationRemarkEmitter *ORE) {
+                                   OptimizationRemarkEmitter *ORE,
+                                   DomTreeUpdater &DTU) {
  if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
    return false;

@ -773,11 +778,11 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
      bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
                                          ArgumentPHIs, !CanTRETailMarkedCall,
-                                          TTI, AA, ORE);
+                                          TTI, AA, ORE, DTU);
      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
-        Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
-                                          TailCallsAreMarkedTail, ArgumentPHIs,
-                                          !CanTRETailMarkedCall, TTI, AA, ORE);
+        Change = foldReturnAndProcessPred(
+            BB, Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs,
+            !CanTRETailMarkedCall, TTI, AA, ORE, DTU);
      MadeChange |= Change;
    }
  }
@ -810,16 +815,27 @@ struct TailCallElim : public FunctionPass {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<PostDominatorTreeWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

+    auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+    auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+    auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+    auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+    // There is no noticeable performance difference here between Lazy and Eager
+    // UpdateStrategy based on some test results. It is feasible to switch the
+    // UpdateStrategy to Lazy if we find it profitable later.
+    DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+
    return eliminateTailRecursion(
        F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
        &getAnalysis<AAResultsWrapperPass>().getAAResults(),
-        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
+        &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
  }
 };
 }
@ -843,12 +859,19 @@ PreservedAnalyses TailCallElimPass::run(Function &F,
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
-  bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE);
+  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+  auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+  // There is no noticeable performance difference here between Lazy and Eager
+  // UpdateStrategy based on some test results. It is feasible to switch the
+  // UpdateStrategy to Lazy if we find it profitable later.
+  DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+  bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE, DTU);

  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<GlobalsAA>();
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<PostDominatorTreeAnalysis>();
  return PA;
 }
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@ -646,7 +646,8 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
 }

 ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
-                                             BasicBlock *Pred) {
+                                             BasicBlock *Pred,
+                                             DomTreeUpdater *DTU) {
  Instruction *UncondBranch = Pred->getTerminator();
  // Clone the return and add it to the end of the predecessor.
  Instruction *NewRet = RI->clone();
@ -680,6 +681,10 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
  // longer branch to them.
  BB->removePredecessor(Pred);
  UncondBranch->eraseFromParent();
+
+  if (DTU)
+    DTU->deleteEdge(Pred, BB);
+
  return cast<ReturnInst>(NewRet);
 }

--- a/test/Transforms/TailCallElim/2010-06-26-MultipleReturnValues.ll
+++ b/test/Transforms/TailCallElim/2010-06-26-MultipleReturnValues.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | FileCheck %s
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
 ; PR7328
 ; PR7506
 define i32 @foo(i32 %x) {
--- a/test/Transforms/TailCallElim/EraseBB.ll
+++ b/test/Transforms/TailCallElim/EraseBB.ll
@ -1,4 +1,4 @@
-; RUN: opt -tailcallelim -S < %s 2>&1 | FileCheck %s
+; RUN: opt -tailcallelim -verify-dom-info -S < %s 2>&1 | FileCheck %s

 ; CHECK: add nsw i32
 ; CHECK-NEXT: br label
--- a/test/Transforms/TailCallElim/accum_recursion.ll
+++ b/test/Transforms/TailCallElim/accum_recursion.ll
@ -1,5 +1,5 @@
-; RUN: opt < %s -tailcallelim -S | FileCheck %s
-; RUN: opt < %s -passes=tailcallelim -S | FileCheck %s
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
+; RUN: opt < %s -passes=tailcallelim -verify-dom-info -S | FileCheck %s

 define i32 @test1_factorial(i32 %x) {
 entry:
--- a/test/Transforms/TailCallElim/ackermann.ll
+++ b/test/Transforms/TailCallElim/ackermann.ll
@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; This function contains two tail calls, which should be eliminated
-; RUN: opt < %s -tailcallelim -stats -disable-output 2>&1 | grep "2 tailcallelim"
+; RUN: opt < %s -tailcallelim -verify-dom-info -stats -disable-output 2>&1 | grep "2 tailcallelim"

 define i32 @Ack(i32 %M.1, i32 %N.1) {
 entry:
--- a/test/Transforms/TailCallElim/basic.ll
+++ b/test/Transforms/TailCallElim/basic.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | FileCheck %s
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s

 declare void @noarg()
 declare void @use(i32*)
--- a/test/Transforms/TailCallElim/deopt-bundle.ll
+++ b/test/Transforms/TailCallElim/deopt-bundle.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | FileCheck %s
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s

 define i32 @f_1(i32 %x) {
 ; CHECK-LABEL: @f_1(
--- a/test/Transforms/TailCallElim/dont_reorder_load.ll
+++ b/test/Transforms/TailCallElim/dont_reorder_load.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | grep call | count 4
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | grep call | count 4
 ; PR4323

 ; Several cases where tail call elimination should not move the load above the
--- a/test/Transforms/TailCallElim/dup_tail.ll
+++ b/test/Transforms/TailCallElim/dup_tail.ll
@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; Duplicate the return into if.end to enable TCE.
-; RUN: opt -tailcallelim -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -tailcallelim -verify-dom-info -stats -disable-output < %s 2>&1 | FileCheck %s

 ; CHECK: Number of return duplicated

--- a/test/Transforms/TailCallElim/inf-recursion.ll
+++ b/test/Transforms/TailCallElim/inf-recursion.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | FileCheck %s
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s

 ; Don't turn this into an infinite loop, this is probably the implementation
 ; of fabs and we expect the codegen to lower fabs.
--- a/test/Transforms/TailCallElim/notail.ll
+++ b/test/Transforms/TailCallElim/notail.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | FileCheck %s
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s

 ; CHECK: tail call void @callee0()
 ; CHECK: notail call void @callee1()
--- a/test/Transforms/TailCallElim/opt-remarks-recursion.ll
+++ b/test/Transforms/TailCallElim/opt-remarks-recursion.ll
@ -1,4 +1,4 @@
-; RUN: opt %s -tailcallelim -pass-remarks=tailcallelim -o /dev/null 2>&1 | FileCheck %s
+; RUN: opt %s -tailcallelim -verify-dom-info -pass-remarks=tailcallelim -o /dev/null 2>&1 | FileCheck %s
 ; RUN: opt %s -o /dev/null -passes='require<opt-remark-emit>,tailcallelim' -pass-remarks=tailcallelim 2>&1 | FileCheck %s

 ; CHECK: /home/davide/pat.c:2:20: transforming tail recursion into loop
--- a/test/Transforms/TailCallElim/reorder_load.ll
+++ b/test/Transforms/TailCallElim/reorder_load.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | FileCheck %s
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
 ; PR4323

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
--- a/test/Transforms/TailCallElim/setjmp.ll
+++ b/test/Transforms/TailCallElim/setjmp.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -S | FileCheck %s
+; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s

 ; Test that we don't tail call in a functions that calls returns_twice
 ; functions.