
[CGP] Duplicate addressing computation in cold paths if required to sink addressing mode

This patch teaches CGP to duplicate addressing mode computations into cold paths (detected via an explicit cold attribute on calls) when that is required to allow the addressing mode to be safely sunk into the basic block containing each load and store.

In general, duplicating code into cold blocks may result in code growth, but should not affect performance. In this case, it's better to duplicate some code than to put extra pressure on the register allocator by forcing it to keep the address live through the entirety of the fast path.
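As an illustration only (this IR is not part of the patch; the function and label names are invented, and the exact form of the sunk computation depends on the target), the motivating shape looks roughly like this:

declare void @foo(i32)
declare void @slowpath(i32, i32*)

define void @example(i1 %cond, i64* %base) {
entry:
  ; Address computed once, then live across the entire hot path below
  %addr = getelementptr inbounds i64, i64* %base, i64 5
  %casted = bitcast i64* %addr to i32*
  br i1 %cond, label %if.then, label %fallthrough

if.then:
  %v = load i32, i32* %casted, align 4    ; hot use of the address
  call void @foo(i32 %v)
  %cmp = icmp eq i32 %v, 0
  br i1 %cmp, label %rare, label %fallthrough

rare:
  ; Cold use: the pointer argument keeps %casted (and thus %addr) alive
  call void @slowpath(i32 %v, i32* %casted) cold
  br label %fallthrough

fallthrough:
  ret void
}

With this patch, CGP can re-materialize the base+40 computation locally in both %if.then (folded into the load's addressing mode) and %rare (feeding the cold call's pointer argument), so the original %addr/%casted values no longer need to stay live across the branch in %entry.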

This patch only handles addressing computations, but in principle we could implement a more general cold-code scheduling heuristic which tries to reduce register pressure in the fast path by duplicating code into the cold path. Getting the profitability of the general case right seemed likely to be challenging, so I stuck to the case we already handle: addressing computations.

Differential Revision: http://reviews.llvm.org/D17652

llvm-svn: 263074
Philip Reames 2016-03-09 23:13:12 +00:00
parent 2413b9d203
commit e6c29ed949
2 changed files with 241 additions and 8 deletions


@@ -1760,6 +1760,18 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
     }
   }
 
+  // If we have a cold call site, try to sink addressing computation into the
+  // cold block. This interacts with our handling for loads and stores to
+  // ensure that we can fold all uses of a potential addressing computation
+  // into their uses. TODO: generalize this to work over profiling data
+  if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+    for (auto &Arg : CI->arg_operands()) {
+      if (!Arg->getType()->isPointerTy())
+        continue;
+      unsigned AS = Arg->getType()->getPointerAddressSpace();
+      return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
+    }
+
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
   if (II) {
     switch (II->getIntrinsicID()) {
@@ -3443,6 +3455,8 @@ static bool FindAllMemoryUses(
   if (!MightBeFoldableInst(I))
     return true;
 
+  const bool OptSize = I->getFunction()->optForSize();
+
   // Loop over all the uses, recursively processing them.
   for (Use &U : I->uses()) {
     Instruction *UserI = cast<Instruction>(U.getUser());
@@ -3460,6 +3474,11 @@
     }
 
     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
+      // If this is a cold call, we can sink the addressing calculation into
+      // the cold path. See optimizeCallInst
+      if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+        continue;
+
      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
      if (!IA) return true;
@@ -3551,10 +3570,10 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
   if (!BaseReg && !ScaledReg)
     return true;
 
-  // If all uses of this instruction are ultimately load/store/inlineasm's,
-  // check to see if their addressing modes will include this instruction. If
-  // so, we can fold it into all uses, so it doesn't matter if it has multiple
-  // uses.
+  // If all uses of this instruction can have the address mode sunk into them,
+  // we can remove the addressing mode and effectively trade one live register
+  // for another (at worst.) In this context, folding an addressing mode into
+  // the use is just a particularly nice way of sinking it.
   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
   SmallPtrSet<Instruction*, 16> ConsideredInsts;
   if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
@@ -3562,8 +3581,13 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
 
   // Now that we know that all uses of this instruction are part of a chain of
   // computation involving only operations that could theoretically be folded
-  // into a memory use, loop over each of these uses and see if they could
-  // *actually* fold the instruction.
+  // into a memory use, loop over each of these memory operation uses and see
+  // if they could *actually* fold the instruction. The assumption is that
+  // addressing modes are cheap and that duplicating the computation involved
+  // many times is worthwhile, even on a fastpath. For sinking candidates
+  // (i.e. cold call sites), this serves as a way to prevent excessive code
+  // growth since most architectures have some reasonable small and fast way to
+  // compute an effective address. (i.e LEA on x86)
   SmallVector<Instruction*, 32> MatchedAddrModeInsts;
   for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
     Instruction *User = MemoryUses[i].first;
@@ -3617,6 +3641,11 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
   return false;
 }
 
+/// Sink addressing mode computation immediately before MemoryInst if doing so
+/// can be done without increasing register pressure. The need for the
+/// register pressure constraint means this can end up being an all or nothing
+/// decision for all uses of the same addressing computation.
+///
 /// Load and Store Instructions often have addressing modes that can do
 /// significant amounts of computation. As such, instruction selection will try
 /// to get the load or store to do as much computation as possible for the
@@ -3624,7 +3653,13 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
 /// such, we sink as much legal addressing mode work into the block as possible.
 ///
 /// This method is used to optimize both load/store and inline asms with memory
-/// operands.
+/// operands. It's also used to sink addressing computations feeding into cold
+/// call sites into their (cold) basic block.
+///
+/// The motivation for handling sinking into cold blocks is that doing so can
+/// both enable other address mode sinking (by satisfying the register pressure
+/// constraint above), and reduce register pressure globally (by removing the
+/// addressing mode computation from the fast path entirely).
 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                                         Type *AccessTy, unsigned AddrSpace) {
   Value *Repl = Addr;
@@ -3663,7 +3698,9 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
       continue;
     }
 
-    // For non-PHIs, determine the addressing mode being computed.
+    // For non-PHIs, determine the addressing mode being computed. Note that
+    // the result may differ depending on what other uses our candidate
+    // addressing instructions might have.
     SmallVector<Instruction*, 16> NewAddrModeInsts;
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
         V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,


@@ -0,0 +1,196 @@
; RUN: opt -S -codegenprepare < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
; Can we sink a single addressing mode computation to its use?
define void @test1(i1 %cond, i64* %base) {
; CHECK-LABEL: @test1
; CHECK: add i64 {{.+}}, 40
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
%v = load i32, i32* %casted, align 4
br label %fallthrough
fallthrough:
ret void
}
declare void @foo(i32)
; Make sure sinking two copies of addressing mode into different blocks works
define void @test2(i1 %cond, i64* %base) {
; CHECK-LABEL: @test2
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %next, label %fallthrough
next:
; CHECK-LABEL: next:
; CHECK: add i64 {{.+}}, 40
%v2 = load i32, i32* %casted, align 4
call void @foo(i32 %v2)
br label %fallthrough
fallthrough:
ret void
}
; If we have two loads in the same block, we only need one copy of the
; addressing mode - instruction selection will duplicate it if needed
define void @test3(i1 %cond, i64* %base) {
; CHECK-LABEL: @test3
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
; CHECK-NOT: add i64 {{.+}}, 40
%v2 = load i32, i32* %casted, align 4
call void @foo(i32 %v2)
br label %fallthrough
fallthrough:
ret void
}
; Can we still sink addressing mode if there's a cold use of the
; address itself?
define void @test4(i1 %cond, i64* %base) {
; CHECK-LABEL: @test4
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %rare.1, label %fallthrough
fallthrough:
ret void
rare.1:
; CHECK-LABEL: rare.1:
; CHECK: add i64 {{.+}}, 40
call void @slowpath(i32 %v1, i32* %casted) cold
br label %fallthrough
}
; Negative test - don't want to duplicate addressing into hot path
define void @test5(i1 %cond, i64* %base) {
; CHECK-LABEL: @test5
entry:
; CHECK: %addr = getelementptr
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK-NOT: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %rare.1, label %fallthrough
fallthrough:
ret void
rare.1:
call void @slowpath(i32 %v1, i32* %casted) ;; NOT COLD
br label %fallthrough
}
; Negative test - opt for size
define void @test6(i1 %cond, i64* %base) minsize {
; CHECK-LABEL: @test6
entry:
; CHECK: %addr = getelementptr
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK-NOT: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %rare.1, label %fallthrough
fallthrough:
ret void
rare.1:
call void @slowpath(i32 %v1, i32* %casted) cold
br label %fallthrough
}
; Make sure sinking two copies of addressing mode into different blocks works
; when there are cold paths for each.
define void @test7(i1 %cond, i64* %base) {
; CHECK-LABEL: @test7
entry:
%addr = getelementptr inbounds i64, i64* %base, i64 5
%casted = bitcast i64* %addr to i32*
br i1 %cond, label %if.then, label %fallthrough
if.then:
; CHECK-LABEL: if.then:
; CHECK: add i64 {{.+}}, 40
%v1 = load i32, i32* %casted, align 4
call void @foo(i32 %v1)
%cmp = icmp eq i32 %v1, 0
br i1 %cmp, label %rare.1, label %next
next:
; CHECK-LABEL: next:
; CHECK: add i64 {{.+}}, 40
%v2 = load i32, i32* %casted, align 4
call void @foo(i32 %v2)
%cmp2 = icmp eq i32 %v2, 0
br i1 %cmp2, label %rare.2, label %fallthrough
fallthrough:
ret void
rare.1:
; CHECK-LABEL: rare.1:
; CHECK: add i64 {{.+}}, 40
call void @slowpath(i32 %v1, i32* %casted) cold
br label %next
rare.2:
; CHECK-LABEL: rare.2:
; CHECK: add i64 {{.+}}, 40
call void @slowpath(i32 %v2, i32* %casted) cold
br label %fallthrough
}
declare void @slowpath(i32, i32*)