1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[Power9] Add addi post-ra scheduling heuristic

The addi instruction is typically used to post-increment the loop induction variable, which looks like this:

label_X:
 load x, base(i)
 ...
 y = op x
 ...
 i = addi i, 1
 goto label_X

However, on PowerPC, if there are too many VSX instructions between y = op x and i = addi i, 1,
they can use up all the hardware resources and block the execution of i = addi i, 1, which results in
a stall of the load instruction in the next iteration. So, to avoid this starvation, a heuristic is added
that moves the addi as early as possible, letting the load hide the latency of the VSX instructions;
it is used only when no other heuristic applies.

Reviewed By: jji

Differential Revision: https://reviews.llvm.org/D80269
This commit is contained in:
QingShan Zhang 2020-06-08 01:31:07 +00:00
parent 10c2d5387d
commit a619e90821
10 changed files with 228 additions and 28 deletions

View File

@ -1064,7 +1064,7 @@ public:
}
protected:
void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand);
virtual void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand);
void pickNodeFromQueue(SchedCandidate &Cand);
};

View File

@ -15,6 +15,16 @@ static cl::opt<bool>
DisableAddiLoadHeuristic("disable-ppc-sched-addi-load",
cl::desc("Disable scheduling addi instruction before"
"load for ppc"), cl::Hidden);
// Command-line switch (-ppc-postra-bias-addi) controlling the post-RA ADDI
// bias heuristic. Declared cl::Hidden and initialised to true.
// Note: adjacent string literals are concatenated verbatim, so the first
// fragment must end with a space to keep "early" and "as" apart.
static cl::opt<bool>
    EnableAddiHeuristic("ppc-postra-bias-addi",
                        cl::desc("Enable scheduling addi instruction as early "
                                 "as possible post ra"),
                        cl::Hidden, cl::init(true));
/// Returns true when the machine instruction wrapped by \p Cand has the opcode
/// PPC::ADDI or PPC::ADDI8, and false for every other opcode.
static bool isADDIInstr(const GenericScheduler::SchedCandidate &Cand) {
  // Fetch the opcode once instead of walking SU -> MachineInstr twice.
  const unsigned Opcode = Cand.SU->getInstr()->getOpcode();
  return Opcode == PPC::ADDI || Opcode == PPC::ADDI8;
}
bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
@ -22,19 +32,13 @@ bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
if (DisableAddiLoadHeuristic)
return false;
auto isADDIInstr = [&] (const MachineInstr &Inst) {
return Inst.getOpcode() == PPC::ADDI || Inst.getOpcode() == PPC::ADDI8;
};
SchedCandidate &FirstCand = Zone.isTop() ? TryCand : Cand;
SchedCandidate &SecondCand = Zone.isTop() ? Cand : TryCand;
if (isADDIInstr(*FirstCand.SU->getInstr()) &&
SecondCand.SU->getInstr()->mayLoad()) {
if (isADDIInstr(FirstCand) && SecondCand.SU->getInstr()->mayLoad()) {
TryCand.Reason = Stall;
return true;
}
if (FirstCand.SU->getInstr()->mayLoad() &&
isADDIInstr(*SecondCand.SU->getInstr())) {
if (FirstCand.SU->getInstr()->mayLoad() && isADDIInstr(SecondCand)) {
TryCand.Reason = NoCand;
return true;
}
@ -61,6 +65,38 @@ void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
return;
}
/// Post-RA ADDI bias: favours \p TryCand over \p Cand when \p TryCand is an
/// ADDI and \p Cand is not.
///
/// \returns true if \p TryCand was preferred, in which case its Reason has
/// been set to Stall; false if the heuristic is switched off or does not
/// apply to this pair of candidates.
bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,
                                               SchedCandidate &TryCand) const {
  // The option is checked first, so the candidates are not inspected at all
  // when the heuristic is disabled.
  const bool PreferTryCand =
      EnableAddiHeuristic && isADDIInstr(TryCand) && !isADDIInstr(Cand);

  if (PreferTryCand)
    TryCand.Reason = Stall;

  return PreferTryCand;
}
/// Compares \p TryCand against the current best candidate \p Cand: the
/// generic post-RA comparison runs first, and the PowerPC-specific ADDI bias
/// is layered on top of its result.
void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                          SchedCandidate &TryCand) {
  // Run the target-independent comparison first.
  PostGenericScheduler::tryCandidate(Cand, TryCand);

  // Stop here when Cand is not a valid candidate.
  if (!Cand.isValid())
    return;

  // The PowerPC post-RA heuristics only get a say when the generic comparison
  // either did not select TryCand (NoCand) or selected it purely on node
  // order; any other reason is left untouched.
  const bool OpenToTargetBias =
      TryCand.Reason == NodeOrder || TryCand.Reason == NoCand;
  if (!OpenToTargetBias)
    return;

  // Scheduling the ADDI as early as possible post-RA helps it avoid being
  // stalled behind vector instructions that occupy all the hardware units.
  // ADDI is usually the post-increment of the loop induction variable, so
  // this matters for performance.
  if (biasAddiCandidate(Cand, TryCand))
    return;
}
void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
// Custom PPC PostRA specific behavior here.
PostGenericScheduler::enterMBB(MBB);

View File

@ -42,6 +42,9 @@ protected:
SUnit *pickNode(bool &IsTopNode) override;
void enterMBB(MachineBasicBlock *MBB) override;
void leaveMBB() override;
// PowerPC post-RA candidate comparison: runs the generic
// PostGenericScheduler comparison first, then applies the ADDI bias heuristic
// when the generic result is NoCand or NodeOrder.
void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override;
// Returns true and sets TryCand.Reason to Stall when TryCand is an ADDI/ADDI8
// and Cand is not. Returns false otherwise, or when the heuristic is disabled
// via the ppc-postra-bias-addi option.
bool biasAddiCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) const;
};
} // end namespace llvm

View File

@ -500,12 +500,12 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) {
; CHECK-NEXT: ld r0, -8(r6)
; CHECK-NEXT: add r29, r0, r29
; CHECK-NEXT: .LBB6_3: #
; CHECK-NEXT: addi r6, r6, 1
; CHECK-NEXT: mulld r0, r29, r28
; CHECK-NEXT: mulld r0, r0, r30
; CHECK-NEXT: mulld r0, r0, r12
; CHECK-NEXT: mulld r0, r0, r11
; CHECK-NEXT: maddld r3, r0, r7, r3
; CHECK-NEXT: addi r6, r6, 1
; CHECK-NEXT: bdz .LBB6_9
; CHECK-NEXT: .LBB6_4: #
; CHECK-NEXT: lbzu r0, 1(r5)

View File

@ -13,12 +13,12 @@ define void @f(i8*, i8*, i64*) {
; CHECK-NEXT: add 3, 3, 4
; CHECK-NEXT: li 4, 0
; CHECK-NEXT: .p2align 5
; CHECK-NEXT: .LBB0_2: #
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: addi 7, 4, 1
; CHECK-NEXT: sldi 6, 6, 4
; CHECK-NEXT: cmplwi 4, 14
; CHECK-NEXT: addi 7, 4, 1
; CHECK-NEXT: bc 12, 1, .LBB0_4
; CHECK-NEXT: # %bb.3: #
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: cmpd 3, 4
; CHECK-NEXT: mr 4, 7
; CHECK-NEXT: bc 4, 2, .LBB0_2

View File

@ -0,0 +1,161 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck -check-prefix=CHECK-P9 %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -ppc-postra-bias-addi=false < %s |\
; RUN: FileCheck -check-prefix=CHECK-P9-NO-HEURISTIC %s
%_type_of_scalars = type <{ [16 x i8], double, [152 x i8] }>
%_elem_type_of_x = type <{ double }>
%_elem_type_of_a = type <{ double }>
@scalars = common local_unnamed_addr global %_type_of_scalars zeroinitializer, align 16
define void @test([0 x %_elem_type_of_x]* noalias %.x, [0 x %_elem_type_of_a]* %.a, i64* noalias %.n) {
; CHECK-P9-LABEL: test:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: ld 5, 0(5)
; CHECK-P9-NEXT: addis 6, 2, scalars@toc@ha
; CHECK-P9-NEXT: addi 6, 6, scalars@toc@l
; CHECK-P9-NEXT: addi 6, 6, 16
; CHECK-P9-NEXT: rldicr 5, 5, 0, 58
; CHECK-P9-NEXT: addi 5, 5, -32
; CHECK-P9-NEXT: rldicl 5, 5, 59, 5
; CHECK-P9-NEXT: addi 5, 5, 1
; CHECK-P9-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NEXT: mtctr 5
; CHECK-P9-NEXT: .p2align 4
; CHECK-P9-NEXT: .LBB0_1: # %vector.body
; CHECK-P9-NEXT: #
; CHECK-P9-NEXT: lxv 1, 16(4)
; CHECK-P9-NEXT: lxv 2, 0(4)
; CHECK-P9-NEXT: lxv 3, 48(4)
; CHECK-P9-NEXT: lxv 4, 32(4)
; CHECK-P9-NEXT: xvmuldp 2, 2, 0
; CHECK-P9-NEXT: lxv 5, 240(4)
; CHECK-P9-NEXT: lxv 6, 224(4)
; CHECK-P9-NEXT: xvmuldp 1, 1, 0
; CHECK-P9-NEXT: xvmuldp 4, 4, 0
; CHECK-P9-NEXT: xvmuldp 3, 3, 0
; CHECK-P9-NEXT: xvmuldp 5, 5, 0
; CHECK-P9-NEXT: stxv 1, 16(3)
; CHECK-P9-NEXT: stxv 3, 48(3)
; CHECK-P9-NEXT: stxv 4, 32(3)
; CHECK-P9-NEXT: stxv 5, 240(3)
; CHECK-P9-NEXT: addi 4, 4, 256
; CHECK-P9-NEXT: xvmuldp 6, 6, 0
; CHECK-P9-NEXT: stxv 2, 0(3)
; CHECK-P9-NEXT: stxv 6, 224(3)
; CHECK-P9-NEXT: addi 3, 3, 256
; CHECK-P9-NEXT: bdnz .LBB0_1
; CHECK-P9-NEXT: # %bb.2: # %return.block
; CHECK-P9-NEXT: blr
;
; CHECK-P9-NO-HEURISTIC-LABEL: test:
; CHECK-P9-NO-HEURISTIC: # %bb.0: # %entry
; CHECK-P9-NO-HEURISTIC-NEXT: ld 5, 0(5)
; CHECK-P9-NO-HEURISTIC-NEXT: addis 6, 2, scalars@toc@ha
; CHECK-P9-NO-HEURISTIC-NEXT: addi 6, 6, scalars@toc@l
; CHECK-P9-NO-HEURISTIC-NEXT: rldicr 5, 5, 0, 58
; CHECK-P9-NO-HEURISTIC-NEXT: addi 6, 6, 16
; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, -32
; CHECK-P9-NO-HEURISTIC-NEXT: rldicl 5, 5, 59, 5
; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, 1
; CHECK-P9-NO-HEURISTIC-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NO-HEURISTIC-NEXT: mtctr 5
; CHECK-P9-NO-HEURISTIC-NEXT: .p2align 4
; CHECK-P9-NO-HEURISTIC-NEXT: .LBB0_1: # %vector.body
; CHECK-P9-NO-HEURISTIC-NEXT: #
; CHECK-P9-NO-HEURISTIC-NEXT: lxv 1, 16(4)
; CHECK-P9-NO-HEURISTIC-NEXT: lxv 2, 0(4)
; CHECK-P9-NO-HEURISTIC-NEXT: lxv 3, 48(4)
; CHECK-P9-NO-HEURISTIC-NEXT: lxv 4, 32(4)
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 2, 2, 0
; CHECK-P9-NO-HEURISTIC-NEXT: lxv 5, 240(4)
; CHECK-P9-NO-HEURISTIC-NEXT: lxv 6, 224(4)
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 1, 1, 0
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 4, 4, 0
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 3, 3, 0
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 6, 6, 0
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 5, 5, 0
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 1, 16(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 2, 0(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 3, 48(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 4, 32(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 5, 240(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 6, 224(3)
; CHECK-P9-NO-HEURISTIC-NEXT: addi 4, 4, 256
; CHECK-P9-NO-HEURISTIC-NEXT: addi 3, 3, 256
; CHECK-P9-NO-HEURISTIC-NEXT: bdnz .LBB0_1
; CHECK-P9-NO-HEURISTIC-NEXT: # %bb.2: # %return.block
; CHECK-P9-NO-HEURISTIC-NEXT: blr
entry:
%x_rvo_based_addr_3 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1
%a_rvo_based_addr_5 = getelementptr inbounds [0 x %_elem_type_of_a], [0 x %_elem_type_of_a]* %.a, i64 0, i64 -1
%_val_n_ = load i64, i64* %.n, align 8
%_val_c1_ = load double, double* getelementptr inbounds (%_type_of_scalars, %_type_of_scalars* @scalars, i64 0, i32 1), align 16
%n.vec = and i64 %_val_n_, -32
%broadcast.splatinsert26 = insertelement <4 x double> undef, double %_val_c1_, i32 0
%broadcast.splat27 = shufflevector <4 x double> %broadcast.splatinsert26, <4 x double> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%offset.idx = or i64 %index, 1
%0 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_3, i64 %offset.idx, i32 0
%1 = getelementptr %_elem_type_of_a, %_elem_type_of_a* %a_rvo_based_addr_5, i64 %offset.idx, i32 0
%2 = bitcast double* %1 to <4 x double>*
%wide.load = load <4 x double>, <4 x double>* %2, align 8
%3 = getelementptr double, double* %1, i64 4
%4 = bitcast double* %3 to <4 x double>*
%wide.load19 = load <4 x double>, <4 x double>* %4, align 8
%5 = getelementptr double, double* %1, i64 8
%6 = bitcast double* %5 to <4 x double>*
%wide.load20 = load <4 x double>, <4 x double>* %6, align 8
%7 = getelementptr double, double* %1, i64 12
%8 = bitcast double* %7 to <4 x double>*
%wide.load21 = load <4 x double>, <4 x double>* %8, align 8
%9 = getelementptr double, double* %1, i64 16
%10 = bitcast double* %9 to <4 x double>*
%wide.load22 = load <4 x double>, <4 x double>* %10, align 8
%11 = getelementptr double, double* %1, i64 20
%12 = bitcast double* %11 to <4 x double>*
%wide.load23 = load <4 x double>, <4 x double>* %12, align 8
%13 = getelementptr double, double* %1, i64 24
%14 = bitcast double* %13 to <4 x double>*
%wide.load24 = load <4 x double>, <4 x double>* %14, align 8
%15 = getelementptr double, double* %1, i64 28
%16 = bitcast double* %15 to <4 x double>*
%wide.load25 = load <4 x double>, <4 x double>* %16, align 8
%17 = fmul fast <4 x double> %wide.load, %broadcast.splat27
%18 = fmul fast <4 x double> %wide.load19, %broadcast.splat27
%19 = fmul fast <4 x double> %wide.load20, %broadcast.splat27
%20 = fmul fast <4 x double> %wide.load21, %broadcast.splat27
%21 = fmul fast <4 x double> %wide.load22, %broadcast.splat27
%22 = fmul fast <4 x double> %wide.load23, %broadcast.splat27
%23 = fmul fast <4 x double> %wide.load24, %broadcast.splat27
%24 = fmul fast <4 x double> %wide.load25, %broadcast.splat27
%25 = bitcast double* %0 to <4 x double>*
store <4 x double> %17, <4 x double>* %25, align 8
%26 = getelementptr double, double* %0, i64 4
%27 = bitcast double* %26 to <4 x double>*
store <4 x double> %18, <4 x double>* %27, align 8
%28 = getelementptr double, double* %0, i64 8
%29 = bitcast double* %28 to <4 x double>*
%30 = getelementptr double, double* %0, i64 12
%31 = bitcast double* %30 to <4 x double>*
%32 = getelementptr double, double* %0, i64 16
%33 = bitcast double* %32 to <4 x double>*
%34 = getelementptr double, double* %0, i64 20
%35 = bitcast double* %34 to <4 x double>*
%36 = getelementptr double, double* %0, i64 24
%37 = bitcast double* %36 to <4 x double>*
%38 = getelementptr double, double* %0, i64 28
%39 = bitcast double* %38 to <4 x double>*
store <4 x double> %24, <4 x double>* %39, align 8
%index.next = add i64 %index, 32
%cm = icmp eq i64 %index.next, %n.vec
br i1 %cm, label %return.block, label %vector.body
return.block:
ret void
}

View File

@ -22,35 +22,35 @@ define void @print_res() nounwind {
; CHECK-NEXT: isellt 3, 3, 4
; CHECK-NEXT: li 4, 0
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: li 7, -1
; CHECK-NEXT: mtctr 3
; CHECK-NEXT: lbz 5, 0(5)
; CHECK-NEXT: li 3, 1
; CHECK-NEXT: li 7, -1
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: lbz 5, 0(5)
; CHECK-NEXT: bdz .LBB0_6
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: addi 8, 7, -1
; CHECK-NEXT: xori 6, 5, 84
; CHECK-NEXT: clrldi 5, 7, 32
; CHECK-NEXT: lbz 5, 0(5)
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: addi 8, 7, -1
; CHECK-NEXT: bdz .LBB0_5
; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: cntlzw 6, 6
; CHECK-NEXT: srwi 7, 6, 5
; CHECK-NEXT: xori 6, 5, 84
; CHECK-NEXT: clrldi 5, 8, 32
; CHECK-NEXT: addi 8, 8, -1
; CHECK-NEXT: lbz 5, 0(5)
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: bdz .LBB0_4
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: clrldi 10, 8, 32
; CHECK-NEXT: addi 8, 8, -1
; CHECK-NEXT: cntlzw 9, 6
; CHECK-NEXT: xori 6, 5, 84
; CHECK-NEXT: lbz 5, 0(10)
; CHECK-NEXT: addi 8, 8, -1
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: add 4, 4, 7
; CHECK-NEXT: srwi 7, 9, 5
; CHECK-NEXT: bdnz .LBB0_3

View File

@ -14,20 +14,20 @@ define void @main() nounwind #0 {
; CHECK-NEXT: mr 30, 3
; CHECK-NEXT: bl calloc
; CHECK-NEXT: nop
; CHECK-NEXT: clrldi 4, 30, 32
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: addi 3, 3, -4
; CHECK-NEXT: li 6, 1
; CHECK-NEXT: clrldi 4, 30, 32
; CHECK-NEXT: mtctr 4
; CHECK-NEXT: mullw 4, 5, 5
; CHECK-NEXT: li 6, 1
; CHECK-NEXT: bdz .LBB0_3
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: addi 5, 6, 1
; CHECK-NEXT: stwu 4, 4(3)
; CHECK-NEXT: mullw 4, 6, 6
; CHECK-NEXT: addi 5, 6, 1
; CHECK-NEXT: bdz .LBB0_3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: #
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: stwu 4, 4(3)
; CHECK-NEXT: mullw 4, 5, 5
; CHECK-NEXT: addi 5, 5, 1

View File

@ -11,12 +11,12 @@ define dso_local i32* @foo() local_unnamed_addr {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis r5, r2, x@toc@ha
; CHECK-NEXT: addi r5, r5, x@toc@l
; CHECK-NEXT: addi r5, r5, -8
; CHECK-NEXT: addis r6, r2, y@toc@ha
; CHECK-NEXT: li r7, 340
; CHECK-NEXT: addi r3, r6, y@toc@l
; CHECK-NEXT: lwz r6, y@toc@l(r6)
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: addi r5, r5, -8
; CHECK-NEXT: lwzu r7, 12(r5)
; CHECK-NEXT: maddld r6, r7, r7, r6
; CHECK-NEXT: lwz r7, 4(r5)

View File

@ -46,11 +46,11 @@ define i16 @test2elt(i64 %a.coerce) local_unnamed_addr #0 {
; CHECK-P9-NEXT: mtfprd f1, r3
; CHECK-P9-NEXT: mffprwz r3, f0
; CHECK-P9-NEXT: mtfprd f0, r3
; CHECK-P9-NEXT: addi r3, r1, -2
; CHECK-P9-NEXT: xxswapd v2, vs1
; CHECK-P9-NEXT: xxswapd v3, vs0
; CHECK-P9-NEXT: vmrglb v2, v3, v2
; CHECK-P9-NEXT: vsldoi v2, v2, v2, 8
; CHECK-P9-NEXT: addi r3, r1, -2
; CHECK-P9-NEXT: stxsihx v2, 0, r3
; CHECK-P9-NEXT: lhz r3, -2(r1)
; CHECK-P9-NEXT: blr
@ -764,11 +764,11 @@ define i16 @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 {
; CHECK-P9-NEXT: mtfprd f1, r3
; CHECK-P9-NEXT: mffprwz r3, f0
; CHECK-P9-NEXT: mtfprd f0, r3
; CHECK-P9-NEXT: addi r3, r1, -2
; CHECK-P9-NEXT: xxswapd v2, vs1
; CHECK-P9-NEXT: xxswapd v3, vs0
; CHECK-P9-NEXT: vmrglb v2, v3, v2
; CHECK-P9-NEXT: vsldoi v2, v2, v2, 8
; CHECK-P9-NEXT: addi r3, r1, -2
; CHECK-P9-NEXT: stxsihx v2, 0, r3
; CHECK-P9-NEXT: lhz r3, -2(r1)
; CHECK-P9-NEXT: blr