mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[MBP] Use profile count to compute tail dup cost if it is available
Current tail duplication in machine block placement pass uses block frequency information in cost model. But frequency number has only relative meaning compared to other basic blocks in the same function. A large frequency number doesn't mean it is hot and a small frequency number doesn't mean it is cold. To overcome this problem, this patch uses profile count in cost model if it's available. So we can tail duplicate real hot basic blocks. Differential Revision: https://reviews.llvm.org/D83265
This commit is contained in:
parent
d9addf9fa8
commit
73f673d871
@ -177,6 +177,14 @@ static cl::opt<unsigned> TailDupPlacementPenalty(
|
||||
cl::init(2),
|
||||
cl::Hidden);
|
||||
|
||||
// Heuristic for tail duplication if profile count is used in cost model.
|
||||
static cl::opt<unsigned> TailDupProfilePercentThreshold(
|
||||
"tail-dup-profile-percent-threshold",
|
||||
cl::desc("If profile count information is used in tail duplication cost "
|
||||
"model, the gained fall through number from tail duplication "
|
||||
"should be at least this percent of hot count."),
|
||||
cl::init(50), cl::Hidden);
|
||||
|
||||
// Heuristic for triangle chains.
|
||||
static cl::opt<unsigned> TriangleChainCount(
|
||||
"triangle-chain-count",
|
||||
@ -377,6 +385,10 @@ class MachineBlockPlacement : public MachineFunctionPass {
|
||||
/// Partial tail duplication threshold.
|
||||
BlockFrequency DupThreshold;
|
||||
|
||||
/// True: use block profile count to compute tail duplication cost.
|
||||
/// False: use block frequency to compute tail duplication cost.
|
||||
bool UseProfileCount;
|
||||
|
||||
/// Allocator and owner of BlockChain structures.
|
||||
///
|
||||
/// We build BlockChains lazily while processing the loop structure of
|
||||
@ -402,6 +414,19 @@ class MachineBlockPlacement : public MachineFunctionPass {
|
||||
SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
|
||||
#endif
|
||||
|
||||
/// Get block profile count or frequency according to UseProfileCount.
|
||||
/// The return value is used to model tail duplication cost.
|
||||
BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
|
||||
if (UseProfileCount) {
|
||||
auto Count = MBFI->getMBFI().getBlockProfileCount(BB);
|
||||
if (Count)
|
||||
return *Count;
|
||||
else
|
||||
return 0;
|
||||
} else
|
||||
return MBFI->getBlockFreq(BB);
|
||||
}
|
||||
|
||||
/// Scale the DupThreshold according to basic block size.
|
||||
BlockFrequency scaleThreshold(MachineBasicBlock *BB);
|
||||
void initDupThreshold();
|
||||
@ -3120,7 +3145,7 @@ bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
|
||||
|
||||
// Compute the number of reduced taken branches if Pred falls through to BB
|
||||
// instead of another successor. Then compare it with threshold.
|
||||
BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
|
||||
BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
|
||||
BlockFrequency Gain = PredFreq * (BBProb - BestProb);
|
||||
return Gain > scaleThreshold(BB);
|
||||
}
|
||||
@ -3194,7 +3219,7 @@ void MachineBlockPlacement::findDuplicateCandidates(
|
||||
// it. But it can beneficially fall through to BB, and duplicate BB into other
|
||||
// predecessors.
|
||||
for (MachineBasicBlock *Pred : Preds) {
|
||||
BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
|
||||
BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
|
||||
|
||||
if (!TailDup.canTailDuplicate(BB, Pred)) {
|
||||
// BB can't be duplicated into Pred, but it is possible to be layout
|
||||
@ -3243,6 +3268,15 @@ void MachineBlockPlacement::initDupThreshold() {
|
||||
if (!F->getFunction().hasProfileData())
|
||||
return;
|
||||
|
||||
// We prefer to use prifile count.
|
||||
uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
|
||||
if (HotThreshold != UINT64_MAX) {
|
||||
UseProfileCount = true;
|
||||
DupThreshold = HotThreshold * TailDupProfilePercentThreshold / 100;
|
||||
return;
|
||||
}
|
||||
|
||||
// Profile count is not available, we can use block frequency instead.
|
||||
BlockFrequency MaxFreq = 0;
|
||||
for (MachineBasicBlock &MBB : *F) {
|
||||
BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
|
||||
@ -3250,10 +3284,9 @@ void MachineBlockPlacement::initDupThreshold() {
|
||||
MaxFreq = Freq;
|
||||
}
|
||||
|
||||
// FIXME: we may use profile count instead of frequency,
|
||||
// and need more fine tuning.
|
||||
BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
|
||||
DupThreshold = MaxFreq * ThresholdProb;
|
||||
UseProfileCount = false;
|
||||
}
|
||||
|
||||
bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
|
||||
|
105
test/CodeGen/X86/dup-cost.ll
Normal file
105
test/CodeGen/X86/dup-cost.ll
Normal file
@ -0,0 +1,105 @@
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
|
||||
|
||||
; Cold function, %dup should not be duplicated into predecessors.
|
||||
define i32 @cold(i32 %a, i32* %p, i32* %q) !prof !21 {
|
||||
; CHECK-LABEL: cold
|
||||
; CHECK: %entry
|
||||
; CHECK: %true1
|
||||
; CHECK: %dup
|
||||
; CHECK: %true2
|
||||
; CHECK: %false1
|
||||
; CHECK: %false2
|
||||
entry:
|
||||
%cond1 = icmp sgt i32 %a, 1
|
||||
br i1 %cond1, label %true1, label %false1, !prof !30
|
||||
|
||||
true1:
|
||||
%v1 = load i32, i32* %p, align 4
|
||||
%v2 = add i32 %v1, 2
|
||||
br label %dup
|
||||
|
||||
false1:
|
||||
%v3 = load i32, i32* %q, align 4
|
||||
%v4 = sub i32 %v3, 3
|
||||
br label %dup
|
||||
|
||||
dup:
|
||||
%v5 = phi i32 [%v2, %true1], [%v4, %false1]
|
||||
%cond2 = icmp sgt i32 %v5, 4
|
||||
br i1 %cond2, label %true2, label %false2, !prof !30
|
||||
|
||||
true2:
|
||||
%v6 = xor i32 %v5, %a
|
||||
br label %exit
|
||||
|
||||
false2:
|
||||
%v7 = and i32 %v5, %a
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
%v8 = phi i32 [%v6, %true2], [%v7, %false2]
|
||||
ret i32 %v8
|
||||
}
|
||||
|
||||
; Same code as previous function, but with hot profile count.
|
||||
; So %dup should be duplicated into predecessors.
|
||||
define i32 @hot(i32 %a, i32* %p, i32* %q) !prof !22 {
|
||||
; CHECK-LABEL: hot
|
||||
; CHECK: %entry
|
||||
; CHECK: %true1
|
||||
; CHECK: %false2
|
||||
; CHECK: %false1
|
||||
; CHECK: %true2
|
||||
entry:
|
||||
%cond1 = icmp sgt i32 %a, 1
|
||||
br i1 %cond1, label %true1, label %false1, !prof !30
|
||||
|
||||
true1:
|
||||
%v1 = load i32, i32* %p, align 4
|
||||
%v2 = add i32 %v1, 2
|
||||
br label %dup
|
||||
|
||||
false1:
|
||||
%v3 = load i32, i32* %q, align 4
|
||||
%v4 = sub i32 %v3, 3
|
||||
br label %dup
|
||||
|
||||
dup:
|
||||
%v5 = phi i32 [%v2, %true1], [%v4, %false1]
|
||||
%cond2 = icmp sgt i32 %v5, 4
|
||||
br i1 %cond2, label %true2, label %false2, !prof !30
|
||||
|
||||
true2:
|
||||
%v6 = xor i32 %v5, %a
|
||||
br label %exit
|
||||
|
||||
false2:
|
||||
%v7 = and i32 %v5, %a
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
%v8 = phi i32 [%v6, %true2], [%v7, %false2]
|
||||
ret i32 %v8
|
||||
}
|
||||
|
||||
|
||||
!llvm.module.flags = !{!1}
|
||||
!21 = !{!"function_entry_count", i64 10}
|
||||
!22 = !{!"function_entry_count", i64 400}
|
||||
|
||||
!30 = !{!"branch_weights", i32 1, i32 1}
|
||||
|
||||
!1 = !{i32 1, !"ProfileSummary", !2}
|
||||
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
|
||||
!3 = !{!"ProfileFormat", !"InstrProf"}
|
||||
!4 = !{!"TotalCount", i64 10000}
|
||||
!5 = !{!"MaxCount", i64 10}
|
||||
!6 = !{!"MaxInternalCount", i64 1}
|
||||
!7 = !{!"MaxFunctionCount", i64 1000}
|
||||
!8 = !{!"NumCounts", i64 3}
|
||||
!9 = !{!"NumFunctions", i64 3}
|
||||
!10 = !{!"DetailedSummary", !11}
|
||||
!11 = !{!12, !13, !14}
|
||||
!12 = !{i32 10000, i64 100, i32 1}
|
||||
!13 = !{i32 999000, i64 100, i32 1}
|
||||
!14 = !{i32 999999, i64 1, i32 2}
|
Loading…
x
Reference in New Issue
Block a user