1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[ARM] Improve if-conversion for M-class CPUs without branch predictors

The current heuristic in isProfitableToIfCvt assumes we have a branch predictor,
and so gives the wrong answer in some cases when we don't. This patch adds a
subtarget feature to indicate that a subtarget has no branch predictor, and
changes the heuristic in isProfitableToIfCvt when it's present. This gives a
slight overall improvement in a set of embedded benchmarks on Cortex-M4 and
Cortex-M33.

Differential Revision: https://reviews.llvm.org/D34398

llvm-svn: 306547
This commit is contained in:
John Brawn 2017-06-28 14:11:15 +00:00
parent ed751c8384
commit 3ad0317ae0
6 changed files with 240 additions and 15 deletions

View File

@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
"Has return address stack">;
// Some processors have no branch predictor. This changes the expected cost of
// taking a branch, which in turn affects the choice of whether to use
// predicated instructions.
def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor",
"HasBranchPredictor", "false",
"Has no branch predictor">;
/// DSP extension.
def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
"Supports DSP instructions in ARM and/or Thumb2">;
@ -756,13 +763,19 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
FeatureHasSlowFPVMLx,
FeatureAvoidPartialCPSR]>;
def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>;
def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>;
def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m,
ProcM3,
FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m4", [ARMv7em,
def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
ProcM3,
FeatureHasNoBranchPredictor]>;
def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
FeatureVFP4,
FeatureVFPOnlySP,
FeatureD16]>;
FeatureD16,
FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,
@ -771,11 +784,12 @@ def : ProcNoItin<"cortex-m7", [ARMv7em,
def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
FeatureNoMovt]>;
def : ProcNoItin<"cortex-m33", [ARMv8mMainline,
def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
FeatureDSP,
FeatureFPARMv8,
FeatureD16,
FeatureVFPOnlySP]>;
FeatureVFPOnlySP,
FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,

View File

@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
}
bool ARMBaseInstrInfo::
isProfitableToIfCvt(MachineBasicBlock &,
isProfitableToIfCvt(MachineBasicBlock &TBB,
unsigned TCycles, unsigned TExtra,
MachineBasicBlock &,
MachineBasicBlock &FBB,
unsigned FCycles, unsigned FExtra,
BranchProbability Probability) const {
if (!TCycles)
@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &,
// Here we scale up each component of UnpredCost to avoid precision issue when
// scaling TCycles/FCycles by Probability.
const unsigned ScalingUpFactor = 1024;
unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
unsigned FUnpredCost =
Probability.getCompl().scale(FCycles * ScalingUpFactor);
unsigned UnpredCost = TUnpredCost + FUnpredCost;
UnpredCost += 1 * ScalingUpFactor; // The branch itself
UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor;
unsigned UnpredCost;
if (!Subtarget.hasBranchPredictor()) {
// When we don't have a branch predictor it's always cheaper to not take a
// branch than take it, so we have to take that into account.
unsigned NotTakenBranchCost = 1;
unsigned TakenBranchCost = Subtarget.getMispredictionPenalty();
unsigned TUnpredCycles, FUnpredCycles;
if (!FCycles) {
// Triangle: TBB is the fallthrough
TUnpredCycles = TCycles + NotTakenBranchCost;
FUnpredCycles = TakenBranchCost;
} else {
// Diamond: TBB is the block that is branched to, FBB is the fallthrough
TUnpredCycles = TCycles + TakenBranchCost;
FUnpredCycles = FCycles + NotTakenBranchCost;
}
// The total cost is the cost of each path scaled by their probabilities
unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor);
unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor);
UnpredCost = TUnpredCost + FUnpredCost;
// When predicating assume that the first IT can be folded away but later
// ones cost one cycle each
if (Subtarget.isThumb2() && TCycles + FCycles > 4) {
PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor;
}
} else {
unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
unsigned FUnpredCost =
Probability.getCompl().scale(FCycles * ScalingUpFactor);
UnpredCost = TUnpredCost + FUnpredCost;
UnpredCost += 1 * ScalingUpFactor; // The branch itself
UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
}
return PredCost <= UnpredCost;
}
bool

View File

@ -424,3 +424,4 @@ include "ARMScheduleA9.td"
include "ARMScheduleSwift.td"
include "ARMScheduleR52.td"
include "ARMScheduleA57.td"
include "ARMScheduleM3.td"

View File

@ -0,0 +1,21 @@
//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for the ARM Cortex-M3 processor.
//
//===----------------------------------------------------------------------===//
// Scheduling machine model for Cortex-M3. This def sets only top-level
// machine parameters (issue width, buffering, default load latency and
// branch penalty).
def CortexM3Model : SchedMachineModel {
let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue
let MicroOpBufferSize = 0; // In-order
let LoadLatency = 2; // Latency when not pipelined, not pc-relative
// NOTE(review): presumably this is the value the subtarget reports through
// getMispredictionPenalty(), which the if-conversion heuristic uses as the
// taken-branch cost on cores without a branch predictor -- confirm.
let MispredictPenalty = 2; // Best case branch taken cost
// Marks the model as not describing every instruction; see the
// SchedMachineModel documentation in TargetSchedule.td for the exact effect.
let CompleteModel = 0;
}

View File

@ -246,6 +246,11 @@ protected:
/// avoid issue "normal" call instructions to callees which do not return.
bool HasRetAddrStack = false;
/// HasBranchPredictor - True if the subtarget has a branch predictor. Whether
/// a branch predictor is present changes the expected cost of taking a branch,
/// which in turn affects the choice of whether to use predicated instructions.
bool HasBranchPredictor = true;
/// HasMPExtension - True if the subtarget supports Multiprocessing
/// extension (ARMv7 only).
bool HasMPExtension = false;
@ -554,6 +559,7 @@ public:
bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
bool hasRetAddrStack() const { return HasRetAddrStack; }
bool hasBranchPredictor() const { return HasBranchPredictor; }
bool hasMPExtension() const { return HasMPExtension; }
bool hasDSP() const { return HasDSP; }
bool useNaClTrap() const { return UseNaClTrap; }

View File

@ -0,0 +1,154 @@
; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BP
; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m3 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOBP
declare void @otherfn()
; CHECK-LABEL: triangle1:
; CHECK: itt ne
; CHECK: movne
; CHECK: strne
; Triangle CFG whose conditional block (%if.then) holds a single store and is
; skipped when %n == 0. The expectations for this function use the prefix
; shared by both RUN lines, so the cortex-m7 and cortex-m3 runs must both
; predicate the store (itt ne) instead of branching around it.
define i32 @triangle1(i32 %n, i32* %p) {
entry:
%tobool = icmp eq i32 %n, 0
br i1 %tobool, label %if.end, label %if.then
if.then:
store i32 1, i32* %p, align 4
br label %if.end
if.end:
tail call void @otherfn()
ret i32 0
}
; CHECK-LABEL: triangle2:
; CHECK-BP: itttt ne
; CHECK-BP: movne
; CHECK-BP: strne
; CHECK-BP: movne
; CHECK-BP: strne
; CHECK-NOBP: cbz
; CHECK-NOBP: movs
; CHECK-NOBP: str
; CHECK-NOBP: movs
; CHECK-NOBP: str
; Triangle CFG whose conditional block (%if.then) holds two stores and is
; skipped when %n == 0. This is a case where the two RUN lines diverge: the
; cortex-m7 run expects a single predicated IT block (itttt ne), while the
; cortex-m3 run expects a cbz branch around unpredicated movs/str pairs.
define i32 @triangle2(i32 %n, i32* %p, i32* %q) {
entry:
%tobool = icmp eq i32 %n, 0
br i1 %tobool, label %if.end, label %if.then
if.then:
store i32 1, i32* %p, align 4
store i32 2, i32* %q, align 4
br label %if.end
if.end:
tail call void @otherfn()
ret i32 0
}
; CHECK-LABEL: triangle3:
; CHECK: cbz
; CHECK: movs
; CHECK: str
; CHECK: movs
; CHECK: str
; CHECK: movs
; CHECK: str
; Triangle CFG whose conditional block (%if.then) holds three stores and is
; skipped when %n == 0. The expectations use the prefix shared by both RUN
; lines: neither run should if-convert, and both keep the cbz branch followed
; by three unpredicated movs/str pairs.
define i32 @triangle3(i32 %n, i32* %p, i32* %q, i32* %r) {
entry:
%tobool = icmp eq i32 %n, 0
br i1 %tobool, label %if.end, label %if.then
if.then:
store i32 1, i32* %p, align 4
store i32 2, i32* %q, align 4
store i32 3, i32* %r, align 4
br label %if.end
if.end:
tail call void @otherfn()
ret i32 0
}
; CHECK-LABEL: diamond1:
; CHECK: ite eq
; CHECK: ldreq
; CHECK: strne
; Diamond CFG with one memory operation on each side: %if.then stores %n to
; %p, %if.else (taken when %n == 0) loads from %p, and the phi in %if.end
; selects the result. The expectations use the prefix shared by both RUN
; lines, so both runs must predicate the two sides (ite eq, ldreq, strne).
define i32 @diamond1(i32 %n, i32* %p) {
entry:
%tobool = icmp eq i32 %n, 0
br i1 %tobool, label %if.else, label %if.then
if.then:
store i32 %n, i32* %p, align 4
br label %if.end
if.else:
%0 = load i32, i32* %p, align 4
br label %if.end
if.end:
%n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
tail call void @otherfn()
ret i32 %n.addr.0
}
; CHECK-LABEL: diamond2:
; CHECK-BP: itte
; CHECK-BP: streq
; CHECK-BP: ldreq
; CHECK-BP: strne
; CHECK-NOBP: cbz
; CHECK-NOBP: str
; CHECK-NOBP: b
; CHECK-NOBP: str
; CHECK-NOBP: ldr
; Diamond CFG where %if.then has one store and %if.else (taken when %n == 0)
; has a store followed by a load. This is a case where the two RUN lines
; diverge: the cortex-m7 run expects a predicated IT block (itte), while the
; cortex-m3 run expects real branches (cbz plus an unconditional b).
define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) {
entry:
%tobool = icmp eq i32 %n, 0
br i1 %tobool, label %if.else, label %if.then
if.then:
store i32 %n, i32* %p, align 4
br label %if.end
if.else:
store i32 %m, i32* %q, align 4
%0 = load i32, i32* %p, align 4
br label %if.end
if.end:
%n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
tail call void @otherfn()
ret i32 %n.addr.0
}
; CHECK-LABEL: diamond3:
; CHECK: cbz
; CHECK: movs
; CHECK: str
; CHECK: b
; CHECK: ldr
; CHECK: ldr
; CHECK: adds
; Diamond CFG where %if.then has one store and %if.else (taken when %n == 0)
; has two loads and an add. The expectations use the prefix shared by both RUN
; lines: neither run should if-convert, and both keep the cbz and the
; unconditional b between the two sides.
define i32 @diamond3(i32 %n, i32* %p, i32* %q) {
entry:
%tobool = icmp eq i32 %n, 0
br i1 %tobool, label %if.else, label %if.then
if.then:
store i32 1, i32* %p, align 4
br label %if.end
if.else:
%0 = load i32, i32* %p, align 4
%1 = load i32, i32* %q, align 4
%add = add nsw i32 %1, %0
br label %if.end
if.end:
%n.addr.0 = phi i32 [ %n, %if.then ], [ %add, %if.else ]
tail call void @otherfn()
ret i32 %n.addr.0
}