
[CGP] Duplicate addressing computation in cold paths if required to sink addressing mode

This patch teaches CGP to duplicate addressing mode computations into cold paths (detected via an explicit cold attribute on calls) when that is required to allow the addressing mode to be safely sunk into the basic block containing each load and store.

In general, duplicating code into cold blocks may result in code growth, but should not affect performance. In this case, it's better to duplicate some code than to put extra pressure on the register allocator by forcing it to keep the address live through the entirety of the fast path.
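As an illustration only (this IR is not part of the patch; the function and label names are invented, and the exact form of the sunk computation depends on the target), the motivating shape looks roughly like this:

declare void @foo(i32)
declare void @slowpath(i32, i32*)

define void @example(i1 %cond, i64* %base) {
entry:
  ; Address computed once, then live across the entire hot path below
  %addr = getelementptr inbounds i64, i64* %base, i64 5
  %casted = bitcast i64* %addr to i32*
  br i1 %cond, label %if.then, label %fallthrough

if.then:
  %v = load i32, i32* %casted, align 4    ; hot use of the address
  call void @foo(i32 %v)
  %cmp = icmp eq i32 %v, 0
  br i1 %cmp, label %rare, label %fallthrough

rare:
  ; Cold use: the pointer argument keeps %casted (and thus %addr) alive
  call void @slowpath(i32 %v, i32* %casted) cold
  br label %fallthrough

fallthrough:
  ret void
}

With this patch, CGP can re-materialize the base+40 computation locally in both %if.then (folded into the load's addressing mode) and %rare (feeding the cold call's pointer argument), so the original %addr/%casted values no longer need to stay live across the branch in %entry.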

This patch only handles addressing computations, but in principle we could implement a more general cold-code scheduling heuristic which tries to reduce register pressure in the fast path by duplicating code into the cold path. Getting the profitability of the general case right seemed likely to be challenging, so I stuck to the case we already handle: addressing computations.

Differential Revision: http://reviews.llvm.org/D17652

llvm-svn: 263074
Philip Reames 2016-03-09 23:13:12 +00:00
parent 2413b9d203
commit e6c29ed949
2 changed files with 241 additions and 8 deletions


@@ -1760,6 +1760,18 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
     }
   }
 
+  // If we have a cold call site, try to sink addressing computation into the
+  // cold block. This interacts with our handling for loads and stores to
+  // ensure that we can fold all uses of a potential addressing computation
+  // into their uses. TODO: generalize this to work over profiling data
+  if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+    for (auto &Arg : CI->arg_operands()) {
+      if (!Arg->getType()->isPointerTy())
+        continue;
+      unsigned AS = Arg->getType()->getPointerAddressSpace();
+      return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
+    }
+
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
   if (II) {
     switch (II->getIntrinsicID()) {
@@ -3443,6 +3455,8 @@ static bool FindAllMemoryUses(
   if (!MightBeFoldableInst(I))
     return true;
 
+  const bool OptSize = I->getFunction()->optForSize();
+
   // Loop over all the uses, recursively processing them.
   for (Use &U : I->uses()) {
     Instruction *UserI = cast<Instruction>(U.getUser());
@@ -3460,6 +3474,11 @@
     }
 
     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
+      // If this is a cold call, we can sink the addressing calculation into
+      // the cold path. See optimizeCallInst
+      if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+        continue;
+
      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
      if (!IA) return true;
@@ -3551,10 +3570,10 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
   if (!BaseReg && !ScaledReg)
     return true;
 
-  // If all uses of this instruction are ultimately load/store/inlineasm's,
-  // check to see if their addressing modes will include this instruction. If
-  // so, we can fold it into all uses, so it doesn't matter if it has multiple
-  // uses.
+  // If all uses of this instruction can have the address mode sunk into them,
+  // we can remove the addressing mode and effectively trade one live register
+  // for another (at worst.) In this context, folding an addressing mode into
+  // the use is just a particularly nice way of sinking it.
   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
   SmallPtrSet<Instruction*, 16> ConsideredInsts;
   if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
@@ -3562,8 +3581,13 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
 
   // Now that we know that all uses of this instruction are part of a chain of
   // computation involving only operations that could theoretically be folded
-  // into a memory use, loop over each of these uses and see if they could
-  // *actually* fold the instruction.
+  // into a memory use, loop over each of these memory operation uses and see
+  // if they could *actually* fold the instruction. The assumption is that
+  // addressing modes are cheap and that duplicating the computation involved
+  // many times is worthwhile, even on a fastpath. For sinking candidates
+  // (i.e. cold call sites), this serves as a way to prevent excessive code
+  // growth since most architectures have some reasonable small and fast way to
+  // compute an effective address. (i.e LEA on x86)
   SmallVector<Instruction*, 32> MatchedAddrModeInsts;
   for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
     Instruction *User = MemoryUses[i].first;
@@ -3617,6 +3641,11 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
   return false;
 }
 
+/// Sink addressing mode computation immediately before MemoryInst if doing so
+/// can be done without increasing register pressure. The need for the
+/// register pressure constraint means this can end up being an all or nothing
+/// decision for all uses of the same addressing computation.
+///
 /// Load and Store Instructions often have addressing modes that can do
 /// significant amounts of computation. As such, instruction selection will try
 /// to get the load or store to do as much computation as possible for the
@@ -3624,7 +3653,13 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
 /// such, we sink as much legal addressing mode work into the block as possible.
 ///
 /// This method is used to optimize both load/store and inline asms with memory
-/// operands.
+/// operands. It's also used to sink addressing computations feeding into cold
+/// call sites into their (cold) basic block.
+///
+/// The motivation for handling sinking into cold blocks is that doing so can
+/// both enable other address mode sinking (by satisfying the register pressure
+/// constraint above), and reduce register pressure globally (by removing the
+/// addressing mode computation from the fast path entirely).
 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                                         Type *AccessTy, unsigned AddrSpace) {
   Value *Repl = Addr;
@@ -3663,7 +3698,9 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
       continue;
     }
 
-    // For non-PHIs, determine the addressing mode being computed.
+    // For non-PHIs, determine the addressing mode being computed. Note that
+    // the result may differ depending on what other uses our candidate
+    // addressing instructions might have.
     SmallVector<Instruction*, 16> NewAddrModeInsts;
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
         V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,


@@ -0,0 +1,196 @@
; RUN: opt -S -codegenprepare < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
; Can we sink a single addressing mode computation to its use?
define void @test1(i1 %cond, i64* %base) {
; CHECK-LABEL: @test1
; CHECK: add i64 {{.+}}, 40
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
%v = load i32, i32* %casted, align 4
br label %fallthrough
fallthrough:
ret void
}
declare void @foo(i32)
; Make sure sinking two copies of addressing mode into different blocks works
define void @test2(i1 %cond, i64* %base) {
; CHECK-LABEL: @test2
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %next, label %fallthrough
next:
; CHECK-LABEL: next:
; CHECK: add i64 {{.+}}, 40
%v2 = load i32, i32* %casted, align 4
call void @foo(i32 %v2)
br label %fallthrough
fallthrough:
ret void
}
; If we have two loads in the same block, we only need one copy of the
; addressing mode - instruction selection will duplicate it if needed
define void @test3(i1 %cond, i64* %base) {
; CHECK-LABEL: @test3
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
; CHECK-NOT: add i64 {{.+}}, 40
%v2 = load i32, i32* %casted, align 4
call void @foo(i32 %v2)
br label %fallthrough
fallthrough:
ret void
}
; Can we still sink addressing mode if there's a cold use of the
; address itself?
define void @test4(i1 %cond, i64* %base) {
; CHECK-LABEL: @test4
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %rare.1, label %fallthrough
fallthrough:
ret void
rare.1:
; CHECK-LABEL: rare.1:
; CHECK: add i64 {{.+}}, 40
call void @slowpath(i32 %v1, i32* %casted) cold
br label %fallthrough
}
; Negative test - don't want to duplicate addressing into hot path
define void @test5(i1 %cond, i64* %base) {
; CHECK-LABEL: @test5
entry:
; CHECK: %addr = getelementptr
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK-NOT: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %rare.1, label %fallthrough
fallthrough:
ret void
rare.1:
call void @slowpath(i32 %v1, i32* %casted) ;; NOT COLD
br label %fallthrough
}
; Negative test - opt for size
define void @test6(i1 %cond, i64* %base) minsize {
; CHECK-LABEL: @test6
entry:
; CHECK: %addr = getelementptr
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK-NOT: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %rare.1, label %fallthrough
fallthrough:
ret void
rare.1:
call void @slowpath(i32 %v1, i32* %casted) cold
br label %fallthrough
}
; Make sure sinking two copies of addressing mode into different blocks works
; when there are cold paths for each.
define void @test7(i1 %cond, i64* %base) {
; CHECK-LABEL: @test7
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %rare.1, label %next
next:
; CHECK-LABEL: next:
; CHECK: add i64 {{.+}}, 40
%v2 = load i32, i32* %casted, align 4
call void @foo(i32 %v2)
%cmp2 = icmp eq i32 %v2, 0
br i1 %cmp2, label %rare.2, label %fallthrough
fallthrough:
ret void
rare.1:
; CHECK-LABEL: rare.1:
; CHECK: add i64 {{.+}}, 40
call void @slowpath(i32 %v1, i32* %casted) cold
br label %next
rare.2:
; CHECK-LABEL: rare.2:
; CHECK: add i64 {{.+}}, 40
call void @slowpath(i32 %v2, i32* %casted) cold
br label %fallthrough
}
declare void @slowpath(i32, i32*)