mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 11:02:59 +02:00
[CodeGenPrepare] limit overflow intrinsic matching to a single basic block (2nd try)
This is a subset of the original commit from rL359879, which was reverted because it could crash when using the 'RemovedInstructions' structure that enables delayed deletion of dead instructions. The motivating compile-time win does not require that change, though; we should get most of that win from this change alone. Using/updating a dominator tree to match math overflow patterns may be very expensive in compile time (because of the way CGP uses a DT), so just handle the single-block case. See the post-commit thread for rL354298 for more details: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20190422/646276.html Differential Revision: https://reviews.llvm.org/D61075 llvm-svn: 359969
This commit is contained in:
parent
a6c0a07b42
commit
d3550a79e4
@ -1177,6 +1177,20 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
|
|||||||
bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
|
bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
|
||||||
CmpInst *Cmp,
|
CmpInst *Cmp,
|
||||||
Intrinsic::ID IID) {
|
Intrinsic::ID IID) {
|
||||||
|
if (BO->getParent() != Cmp->getParent()) {
|
||||||
|
// We used to use a dominator tree here to allow multi-block optimization.
|
||||||
|
// But that was problematic because:
|
||||||
|
// 1. It could cause a perf regression by hoisting the math op into the
|
||||||
|
// critical path.
|
||||||
|
// 2. It could cause a perf regression by creating a value that was live
|
||||||
|
// across multiple blocks and increasing register pressure.
|
||||||
|
// 3. Use of a dominator tree could cause large compile-time regression.
|
||||||
|
// This is because we recompute the DT on every change in the main CGP
|
||||||
|
// run-loop. The recomputing is probably unnecessary in many cases, so if
|
||||||
|
// that was fixed, using a DT here would be ok.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// We allow matching the canonical IR (add X, C) back to (usubo X, -C).
|
// We allow matching the canonical IR (add X, C) back to (usubo X, -C).
|
||||||
Value *Arg0 = BO->getOperand(0);
|
Value *Arg0 = BO->getOperand(0);
|
||||||
Value *Arg1 = BO->getOperand(1);
|
Value *Arg1 = BO->getOperand(1);
|
||||||
@ -1186,36 +1200,15 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
|
|||||||
Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
|
Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
|
||||||
}
|
}
|
||||||
|
|
||||||
Instruction *InsertPt;
|
// Insert at the first instruction of the pair.
|
||||||
if (BO->hasOneUse() && BO->user_back() == Cmp) {
|
Instruction *InsertPt = nullptr;
|
||||||
// If the math is only used by the compare, insert at the compare to keep
|
for (Instruction &Iter : *Cmp->getParent()) {
|
||||||
// the condition in the same block as its users. (CGP aggressively sinks
|
if (&Iter == BO || &Iter == Cmp) {
|
||||||
// compares to help out SDAG.)
|
InsertPt = &Iter;
|
||||||
InsertPt = Cmp;
|
break;
|
||||||
} else {
|
|
||||||
// The math and compare may be independent instructions. Check dominance to
|
|
||||||
// determine the insertion point for the intrinsic.
|
|
||||||
bool MathDominates = getDT(*BO->getFunction()).dominates(BO, Cmp);
|
|
||||||
if (!MathDominates && !getDT(*BO->getFunction()).dominates(Cmp, BO))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
BasicBlock *MathBB = BO->getParent(), *CmpBB = Cmp->getParent();
|
|
||||||
if (MathBB != CmpBB) {
|
|
||||||
// Avoid hoisting an extra op into a dominating block and creating a
|
|
||||||
// potentially longer critical path.
|
|
||||||
if (!MathDominates)
|
|
||||||
return false;
|
|
||||||
// Check that the insertion doesn't create a value that is live across
|
|
||||||
// more than two blocks, so to minimise the increase in register pressure.
|
|
||||||
BasicBlock *Dominator = MathDominates ? MathBB : CmpBB;
|
|
||||||
BasicBlock *Dominated = MathDominates ? CmpBB : MathBB;
|
|
||||||
auto Successors = successors(Dominator);
|
|
||||||
if (llvm::find(Successors, Dominated) == Successors.end())
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
InsertPt = MathDominates ? cast<Instruction>(BO) : cast<Instruction>(Cmp);
|
|
||||||
}
|
}
|
||||||
|
assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
|
||||||
|
|
||||||
IRBuilder<> Builder(InsertPt);
|
IRBuilder<> Builder(InsertPt);
|
||||||
Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
|
Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
|
||||||
|
@ -121,7 +121,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, i32* %p) {
|
|||||||
ret i1 %ov
|
ret i1 %ov
|
||||||
}
|
}
|
||||||
|
|
||||||
; Verify insertion point for multi-BB.
|
; This used to verify insertion point for multi-BB, but now we just bail out.
|
||||||
|
|
||||||
declare void @call(i1)
|
declare void @call(i1)
|
||||||
|
|
||||||
@ -131,14 +131,17 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, i64* %p, i1 %cond) nounwi
|
|||||||
; CHECK-NEXT: testb $1, %cl
|
; CHECK-NEXT: testb $1, %cl
|
||||||
; CHECK-NEXT: je .LBB8_2
|
; CHECK-NEXT: je .LBB8_2
|
||||||
; CHECK-NEXT: # %bb.1: # %t
|
; CHECK-NEXT: # %bb.1: # %t
|
||||||
; CHECK-NEXT: subq %rsi, %rdi
|
; CHECK-NEXT: movq %rdi, %rax
|
||||||
; CHECK-NEXT: setb %al
|
; CHECK-NEXT: subq %rsi, %rax
|
||||||
; CHECK-NEXT: movq %rdi, (%rdx)
|
; CHECK-NEXT: movq %rax, (%rdx)
|
||||||
; CHECK-NEXT: testb $1, %cl
|
; CHECK-NEXT: testb $1, %cl
|
||||||
; CHECK-NEXT: jne .LBB8_3
|
; CHECK-NEXT: je .LBB8_2
|
||||||
|
; CHECK-NEXT: # %bb.3: # %end
|
||||||
|
; CHECK-NEXT: cmpq %rsi, %rdi
|
||||||
|
; CHECK-NEXT: setb %al
|
||||||
|
; CHECK-NEXT: retq
|
||||||
; CHECK-NEXT: .LBB8_2: # %f
|
; CHECK-NEXT: .LBB8_2: # %f
|
||||||
; CHECK-NEXT: movl %ecx, %eax
|
; CHECK-NEXT: movl %ecx, %eax
|
||||||
; CHECK-NEXT: .LBB8_3: # %end
|
|
||||||
; CHECK-NEXT: retq
|
; CHECK-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
br i1 %cond, label %t, label %f
|
br i1 %cond, label %t, label %f
|
||||||
|
@ -14,11 +14,10 @@ define i1 @PR41004(i32 %x, i32 %y, i32 %t1) {
|
|||||||
; CHECK-NEXT: br label [[SELECT_END]]
|
; CHECK-NEXT: br label [[SELECT_END]]
|
||||||
; CHECK: select.end:
|
; CHECK: select.end:
|
||||||
; CHECK-NEXT: [[MUL:%.*]] = phi i32 [ [[REM]], [[SELECT_TRUE_SINK]] ], [ 0, [[ENTRY:%.*]] ]
|
; CHECK-NEXT: [[MUL:%.*]] = phi i32 [ [[REM]], [[SELECT_TRUE_SINK]] ], [ 0, [[ENTRY:%.*]] ]
|
||||||
; CHECK-NEXT: [[TMP0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[T1:%.*]], i32 1)
|
; CHECK-NEXT: [[NEG:%.*]] = add i32 [[T1:%.*]], -1
|
||||||
; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0
|
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[NEG]], [[MUL]]
|
||||||
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1
|
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[T1]], 0
|
||||||
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MATH]], [[MUL]]
|
; CHECK-NEXT: ret i1 [[TOBOOL]]
|
||||||
; CHECK-NEXT: ret i1 [[OV]]
|
|
||||||
;
|
;
|
||||||
entry:
|
entry:
|
||||||
%rem = srem i32 %x, 2
|
%rem = srem i32 %x, 2
|
||||||
|
@ -47,15 +47,16 @@ define i64 @uaddo3(i64 %a, i64 %b) nounwind ssp {
|
|||||||
ret i64 %Q
|
ret i64 %Q
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic.
|
||||||
|
|
||||||
define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
|
define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp {
|
||||||
; CHECK-LABEL: @uaddo4(
|
; CHECK-LABEL: @uaddo4(
|
||||||
; CHECK-NEXT: entry:
|
; CHECK-NEXT: entry:
|
||||||
|
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[B:%.*]], [[A:%.*]]
|
||||||
; CHECK-NEXT: br i1 [[C:%.*]], label [[NEXT:%.*]], label [[EXIT:%.*]]
|
; CHECK-NEXT: br i1 [[C:%.*]], label [[NEXT:%.*]], label [[EXIT:%.*]]
|
||||||
; CHECK: next:
|
; CHECK: next:
|
||||||
; CHECK-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[B:%.*]], i64 [[A:%.*]])
|
; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[B]], [[ADD]]
|
||||||
; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
|
; CHECK-NEXT: [[Q:%.*]] = select i1 [[TMP0]], i64 [[B]], i64 42
|
||||||
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
|
|
||||||
; CHECK-NEXT: [[Q:%.*]] = select i1 [[OV]], i64 [[B]], i64 42
|
|
||||||
; CHECK-NEXT: ret i64 [[Q]]
|
; CHECK-NEXT: ret i64 [[Q]]
|
||||||
; CHECK: exit:
|
; CHECK: exit:
|
||||||
; CHECK-NEXT: ret i64 0
|
; CHECK-NEXT: ret i64 0
|
||||||
@ -362,7 +363,7 @@ define i1 @usubo_ne_constant0_op1_i32(i32 %x, i32* %p) {
|
|||||||
ret i1 %ov
|
ret i1 %ov
|
||||||
}
|
}
|
||||||
|
|
||||||
; Verify insertion point for multi-BB.
|
; This used to verify insertion point for multi-BB, but now we just bail out.
|
||||||
|
|
||||||
declare void @call(i1)
|
declare void @call(i1)
|
||||||
|
|
||||||
@ -371,15 +372,14 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, i64* %p, i1 %cond) {
|
|||||||
; CHECK-NEXT: entry:
|
; CHECK-NEXT: entry:
|
||||||
; CHECK-NEXT: br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
|
; CHECK-NEXT: br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
|
||||||
; CHECK: t:
|
; CHECK: t:
|
||||||
; CHECK-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[X:%.*]], i64 [[Y:%.*]])
|
; CHECK-NEXT: [[S:%.*]] = sub i64 [[X:%.*]], [[Y:%.*]]
|
||||||
; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
|
; CHECK-NEXT: store i64 [[S]], i64* [[P:%.*]]
|
||||||
; CHECK-NEXT: [[OV1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
|
|
||||||
; CHECK-NEXT: store i64 [[MATH]], i64* [[P:%.*]]
|
|
||||||
; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[F]]
|
; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[F]]
|
||||||
; CHECK: f:
|
; CHECK: f:
|
||||||
; CHECK-NEXT: ret i1 [[COND]]
|
; CHECK-NEXT: ret i1 [[COND]]
|
||||||
; CHECK: end:
|
; CHECK: end:
|
||||||
; CHECK-NEXT: ret i1 [[OV1]]
|
; CHECK-NEXT: [[OV:%.*]] = icmp ult i64 [[X]], [[Y]]
|
||||||
|
; CHECK-NEXT: ret i1 [[OV]]
|
||||||
;
|
;
|
||||||
entry:
|
entry:
|
||||||
br i1 %cond, label %t, label %f
|
br i1 %cond, label %t, label %f
|
||||||
@ -514,6 +514,26 @@ exit:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; This was crashing when trying to delay instruction removal/deletion.
|
||||||
|
|
||||||
|
declare i64 @llvm.objectsize.i64.p0i8(i8*, i1 immarg, i1 immarg, i1 immarg) #0
|
||||||
|
|
||||||
|
define hidden fastcc void @crash() {
|
||||||
|
; CHECK-LABEL: @crash(
|
||||||
|
; CHECK-NEXT: [[TMP1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
|
||||||
|
; CHECK-NEXT: [[MATH:%.*]] = extractvalue { i64, i1 } [[TMP1]], 0
|
||||||
|
; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1
|
||||||
|
; CHECK-NEXT: [[T2:%.*]] = select i1 undef, i1 undef, i1 [[OV]]
|
||||||
|
; CHECK-NEXT: unreachable
|
||||||
|
;
|
||||||
|
%t0 = add i64 undef, undef
|
||||||
|
%t1 = icmp ult i64 %t0, undef
|
||||||
|
%t2 = select i1 undef, i1 undef, i1 %t1
|
||||||
|
%t3 = call i64 @llvm.objectsize.i64.p0i8(i8* nonnull undef, i1 false, i1 false, i1 false)
|
||||||
|
%t4 = icmp ugt i64 %t3, 7
|
||||||
|
unreachable
|
||||||
|
}
|
||||||
|
|
||||||
; Check that every instruction inserted by -codegenprepare has a debug location.
|
; Check that every instruction inserted by -codegenprepare has a debug location.
|
||||||
; DEBUG: CheckModuleDebugify: PASS
|
; DEBUG: CheckModuleDebugify: PASS
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user