1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 19:12:56 +02:00

MI-Sched: handle latency of in-order operations with the new machine model.

The per-operand machine model allows the target to define "unbuffered"
processor resources. This change is a quick, cheap way to model stalls
caused by the latency of operations that use such resources. This only
applies when the processor's micro-op buffer size is non-zero
(Out-of-Order). We can't precisely model in-order stalls during
out-of-order execution, but this is an easy and effective
heuristic. It benefits cortex-a9 scheduling when using the new
machine model, which is not yet on by default.

MI-Sched for armv7 was evaluated on Swift (and was left disabled only because
of a performance bug related to predication). However, we never
evaluated Cortex-A9 performance on MI-Sched in its current form. This
change adds MI-Sched functionality to reach performance goals on
A9. The only remaining change is to allow MI-Sched to run as a PostRA
pass.

I evaluated performance using a set of options to estimate the performance impact once MI sched is default on armv7:
-mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false

For a simple saxpy loop I see a 1.7x speedup. Here are the llvm-testsuite results:
(min run time over 2 runs, filtering tiny changes)

Speedups:
| Benchmarks/BenchmarkGame/recursive         |  52.39% |
| Benchmarks/VersaBench/beamformer           |  20.80% |
| Benchmarks/Misc/pi                         |  19.97% |
| Benchmarks/Misc/mandel-2                   |  19.95% |
| SPEC/CFP2000/188.ammp                      |  18.72% |
| Benchmarks/McCat/08-main/main              |  18.58% |
| Benchmarks/Misc-C++/Large/sphereflake      |  18.46% |
| Benchmarks/Olden/power                     |  17.11% |
| Benchmarks/Misc-C++/mandel-text            |  16.47% |
| Benchmarks/Misc/oourafft                   |  15.94% |
| Benchmarks/Misc/flops-7                    |  14.99% |
| Benchmarks/FreeBench/distray               |  14.26% |
| SPEC/CFP2006/470.lbm                       |  14.00% |
| mediabench/mpeg2/mpeg2dec/mpeg2decode      |  12.28% |
| Benchmarks/SmallPT/smallpt                 |  10.36% |
| Benchmarks/Misc-C++/Large/ray              |   8.97% |
| Benchmarks/Misc/fp-convert                 |   8.75% |
| Benchmarks/Olden/perimeter                 |   7.10% |
| Benchmarks/Bullet/bullet                   |   7.03% |
| Benchmarks/Misc/mandel                     |   6.75% |
| Benchmarks/Olden/voronoi                   |   6.26% |
| Benchmarks/Misc/flops-8                    |   5.77% |
| Benchmarks/Misc/matmul_f64_4x4             |   5.19% |
| Benchmarks/MiBench/security-rijndael       |   5.15% |
| Benchmarks/Misc/flops-6                    |   5.10% |
| Benchmarks/Olden/tsp                       |   4.46% |
| Benchmarks/MiBench/consumer-lame           |   4.28% |
| Benchmarks/Misc/flops-5                    |   4.27% |
| Benchmarks/mafft/pairlocalalign            |   4.19% |
| Benchmarks/Misc/himenobmtxpa               |   4.07% |
| Benchmarks/Misc/lowercase                  |   4.06% |
| SPEC/CFP2006/433.milc                      |   3.99% |
| Benchmarks/tramp3d-v4                      |   3.79% |
| Benchmarks/FreeBench/pifft                 |   3.66% |
| Benchmarks/Ptrdist/ks                      |   3.21% |
| Benchmarks/Adobe-C++/loop_unroll           |   3.12% |
| SPEC/CINT2000/175.vpr                      |   3.12% |
| Benchmarks/nbench                          |   2.98% |
| SPEC/CFP2000/183.equake                    |   2.91% |
| Benchmarks/Misc/perlin                     |   2.85% |
| Benchmarks/Misc/flops-1                    |   2.82% |
| Benchmarks/Misc-C++-EH/spirit              |   2.80% |
| Benchmarks/Misc/flops-2                    |   2.77% |
| Benchmarks/NPB-serial/is                   |   2.42% |
| Benchmarks/ASC_Sequoia/CrystalMk           |   2.33% |
| Benchmarks/BenchmarkGame/n-body            |   2.28% |
| Benchmarks/SciMark2-C/scimark2             |   2.27% |
| Benchmarks/Olden/bh                        |   2.03% |
| skidmarks10/skidmarks                      |   1.81% |
| Benchmarks/Misc/flops                      |   1.72% |

Slowdowns:
| Benchmarks/llubenchmark/llu                | -14.14% |
| Benchmarks/Polybench/stencils/seidel-2d    |  -5.67% |
| Benchmarks/Adobe-C++/functionobjects       |  -5.25% |
| Benchmarks/Misc-C++/oopack_v1p8            |  -5.00% |
| Benchmarks/Shootout/hash                   |  -2.35% |
| Benchmarks/Prolangs-C++/ocean              |  -2.01% |
| Benchmarks/Polybench/medley/floyd-warshall |  -1.98% |
| Polybench/linear-algebra/kernels/3mm       |  -1.95% |
| Benchmarks/McCat/09-vor/vor                |  -1.68% |

llvm-svn: 196516
This commit is contained in:
Andrew Trick 2013-12-05 17:55:58 +00:00
parent 24a9064bbd
commit 192311ab9a
5 changed files with 201 additions and 15 deletions

View File

@ -292,6 +292,7 @@ namespace llvm {
bool isScheduleHigh : 1; // True if preferable to schedule high.
bool isScheduleLow : 1; // True if preferable to schedule low.
bool isCloned : 1; // True if this node has been cloned.
bool isUnbuffered : 1; // Reads an unbuffered resource.
Sched::Preference SchedulingPref; // Scheduling preference.
private:
@ -316,9 +317,10 @@ namespace llvm {
isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
isAvailable(false), isScheduled(false), isScheduleHigh(false),
isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
isScheduleLow(false), isCloned(false), isUnbuffered(false),
SchedulingPref(Sched::None), isDepthCurrent(false),
isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
/// SUnit - Construct an SUnit for post-regalloc scheduling to represent
/// a MachineInstr.
@ -330,9 +332,10 @@ namespace llvm {
isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
isAvailable(false), isScheduled(false), isScheduleHigh(false),
isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
isScheduleLow(false), isCloned(false), isUnbuffered(false),
SchedulingPref(Sched::None), isDepthCurrent(false),
isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
/// SUnit - Construct a placeholder SUnit.
SUnit()
@ -343,9 +346,10 @@ namespace llvm {
isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
isAvailable(false), isScheduled(false), isScheduleHigh(false),
isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
isScheduleLow(false), isCloned(false), isUnbuffered(false),
SchedulingPref(Sched::None), isDepthCurrent(false),
isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
/// \brief Boundary nodes are placeholders for the boundary of the
/// scheduling region.

View File

@ -1330,7 +1330,7 @@ public:
/// Represent the type of SchedCandidate found within a single queue.
/// pickNodeBidirectional depends on these listed by decreasing priority.
enum CandReason {
NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax,
ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
@ -1583,6 +1583,10 @@ public:
MaxExecutedResCount);
}
/// Get the difference between the given SUnit's ready time and the current
/// cycle.
unsigned getLatencyStallCycles(SUnit *SU);
bool checkHazard(SUnit *SU);
unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs);
@ -1869,6 +1873,23 @@ void GenericScheduler::registerRoots() {
}
}
/// Report how many cycles \p SU still has to wait, measured from this
/// boundary's current cycle, before it reaches its ready cycle.
///
/// This is a "soft stall" derived from the SUnit's ready time. Heuristics
/// treat it differently from the hard stall cycles that are based on CPU
/// resources and computed by checkHazard(). A fully in-order model
/// (MicroOpBufferSize==0) will not make use of it, since there instructions
/// are not available for scheduling until they are ready. A weaker in-order
/// model may use it for heuristics, e.g. when a processor has in-order
/// behavior only when reading certain resources.
///
/// \returns 0 when \p SU is not marked isUnbuffered or its ready cycle is not
/// later than the current cycle; otherwise the distance to the ready cycle.
unsigned GenericScheduler::SchedBoundary::getLatencyStallCycles(SUnit *SU) {
  if (SU->isUnbuffered) {
    // Pick the ready cycle that corresponds to this boundary's direction.
    const unsigned SUReadyCycle =
        isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
    if (CurrCycle < SUReadyCycle)
      return SUReadyCycle - CurrCycle;
  }
  return 0;
}
/// Does this SU have a hazard within the current instruction group.
///
/// The scheduler supports two modes of hazard recognition. The first is the
@ -1948,9 +1969,9 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
/// inside and outside the zone.
void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
SchedBoundary &OtherZone) {
// Now that potential stalls have been considered, apply preemptive heuristics
// based on the the total latency and resources inside and outside this
// zone.
// Apply preemptive heuristics based on the total latency and resources
// inside and outside this zone. Potential stalls should be considered before
// following this policy.
// Compute remaining latency. We need this both to determine whether the
// overall schedule has become latency-limited and whether the instructions
@ -2141,7 +2162,11 @@ void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) {
break;
default:
// We don't currently model the OOO reorder buffer, so consider all
// scheduled MOps to be "retired".
// scheduled MOps to be "retired". We do loosely model in-order resource
// latency. If this instruction uses an in-order resource, account for any
// likely stall cycles.
if (SU->isUnbuffered && ReadyCycle > NextCycle)
NextCycle = ReadyCycle;
break;
}
RetiredMOps += IncMOps;
@ -2514,6 +2539,11 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
&& tryLatency(TryCand, Cand, Zone))
return;
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return;
// Keep clustered nodes together to encourage downstream peephole
// optimizations which may reduce resource requirements.
//
@ -2577,6 +2607,7 @@ const char *GenericScheduler::getReasonStr(
case PhysRegCopy: return "PREG-COPY";
case RegExcess: return "REG-EXCESS";
case RegCritical: return "REG-CRIT ";
case Stall: return "STALL ";
case Cluster: return "CLUSTER ";
case Weak: return "WEAK ";
case RegMax: return "REG-MAX ";

View File

@ -687,6 +687,22 @@ void ScheduleDAGInstrs::initSUnits() {
// Assign the Latency field of SU using target-provided information.
SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
// If this SUnit uses an unbuffered resource, mark it as such.
// These resources are used for in-order execution pipelines within an
// out-of-order core and are identified by BufferSize=1. BufferSize=0 is
// used for dispatch/issue groups and is not considered here.
if (SchedModel.hasInstrSchedModel()) {
const MCSchedClassDesc *SC = getSchedClass(SU);
for (TargetSchedModel::ProcResIter
PI = SchedModel.getWriteProcResBegin(SC),
PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
if (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize == 1) {
SU->isUnbuffered = true;
break;
}
}
}
}
}

View File

@ -1905,7 +1905,7 @@ def A9UnitALU : ProcResource<2>;
def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
def A9UnitAGU : ProcResource<1>;
def A9UnitLS : ProcResource<1>;
def A9UnitFP : ProcResource<1> { let BufferSize = 0; }
def A9UnitFP : ProcResource<1> { let BufferSize = 1; }
def A9UnitB : ProcResource<1>;
//===----------------------------------------------------------------------===//

View File

@ -0,0 +1,135 @@
; RUN: llc < %s -march=arm -mtriple=thumbv7-apple-ios7.0.0 -float-abi=hard -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false | FileCheck %s
;
; Test MI-Sched support for latency-based stalls on an in-order pipeline
; using the new machine model.
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
; Don't be too strict with the top of the schedule, but most of it
; should be nicely pipelined.
;
; CHECK: saxpy10:
; CHECK: vldr
; CHECK: vldr
; CHECK: vldr
; CHECK: vldr
; CHECK: vldr
; CHECK: vldr
; CHECK-NEXT: vmul
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vldr
; CHECK-NEXT: vldr
; CHECK-NEXT: vmul
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vldr
; CHECK-NEXT: vldr
; CHECK-NEXT: vmul
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vldr
; CHECK-NEXT: vldr
; CHECK-NEXT: vmul
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vldr
; CHECK-NEXT: vldr
; CHECK-NEXT: vmul
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vldr
; CHECK-NEXT: vldr
; CHECK-NEXT: vmul
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vldr
; CHECK-NEXT: vldr
; CHECK-NEXT: vmul
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vldr
; CHECK-NEXT: vldr
; CHECK-NEXT: vmul
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vldr
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vadd
; CHECK-NEXT: vmov
; CHECK-NEXT: bx
;
; This accumulates a sum rather than storing each result.
;
; Fully unrolled ten-element saxpy. For each index i in 0..9 the body loads
; data1[i] and data2[i], computes data1[i] * a + data2[i], and adds that value
; to a running sum. The final running sum is the return value.
define float @saxpy10(float* nocapture readonly %data1, float* nocapture readonly %data2, float %a) {
entry:
; Element 0: read directly through the base pointers. The running sum starts
; as this element's result plus 0.0.
%0 = load float* %data1, align 4
%mul = fmul float %0, %a
%1 = load float* %data2, align 4
%add = fadd float %mul, %1
%add2 = fadd float %add, 0.000000e+00
; Element 1.
%arrayidx.1 = getelementptr inbounds float* %data1, i32 1
%2 = load float* %arrayidx.1, align 4
%mul.1 = fmul float %2, %a
%arrayidx1.1 = getelementptr inbounds float* %data2, i32 1
%3 = load float* %arrayidx1.1, align 4
%add.1 = fadd float %mul.1, %3
%add2.1 = fadd float %add2, %add.1
; Element 2.
%arrayidx.2 = getelementptr inbounds float* %data1, i32 2
%4 = load float* %arrayidx.2, align 4
%mul.2 = fmul float %4, %a
%arrayidx1.2 = getelementptr inbounds float* %data2, i32 2
%5 = load float* %arrayidx1.2, align 4
%add.2 = fadd float %mul.2, %5
%add2.2 = fadd float %add2.1, %add.2
; Element 3.
%arrayidx.3 = getelementptr inbounds float* %data1, i32 3
%6 = load float* %arrayidx.3, align 4
%mul.3 = fmul float %6, %a
%arrayidx1.3 = getelementptr inbounds float* %data2, i32 3
%7 = load float* %arrayidx1.3, align 4
%add.3 = fadd float %mul.3, %7
%add2.3 = fadd float %add2.2, %add.3
; Element 4.
%arrayidx.4 = getelementptr inbounds float* %data1, i32 4
%8 = load float* %arrayidx.4, align 4
%mul.4 = fmul float %8, %a
%arrayidx1.4 = getelementptr inbounds float* %data2, i32 4
%9 = load float* %arrayidx1.4, align 4
%add.4 = fadd float %mul.4, %9
%add2.4 = fadd float %add2.3, %add.4
; Element 5.
%arrayidx.5 = getelementptr inbounds float* %data1, i32 5
%10 = load float* %arrayidx.5, align 4
%mul.5 = fmul float %10, %a
%arrayidx1.5 = getelementptr inbounds float* %data2, i32 5
%11 = load float* %arrayidx1.5, align 4
%add.5 = fadd float %mul.5, %11
%add2.5 = fadd float %add2.4, %add.5
; Element 6.
%arrayidx.6 = getelementptr inbounds float* %data1, i32 6
%12 = load float* %arrayidx.6, align 4
%mul.6 = fmul float %12, %a
%arrayidx1.6 = getelementptr inbounds float* %data2, i32 6
%13 = load float* %arrayidx1.6, align 4
%add.6 = fadd float %mul.6, %13
%add2.6 = fadd float %add2.5, %add.6
; Element 7.
%arrayidx.7 = getelementptr inbounds float* %data1, i32 7
%14 = load float* %arrayidx.7, align 4
%mul.7 = fmul float %14, %a
%arrayidx1.7 = getelementptr inbounds float* %data2, i32 7
%15 = load float* %arrayidx1.7, align 4
%add.7 = fadd float %mul.7, %15
%add2.7 = fadd float %add2.6, %add.7
; Element 8.
%arrayidx.8 = getelementptr inbounds float* %data1, i32 8
%16 = load float* %arrayidx.8, align 4
%mul.8 = fmul float %16, %a
%arrayidx1.8 = getelementptr inbounds float* %data2, i32 8
%17 = load float* %arrayidx1.8, align 4
%add.8 = fadd float %mul.8, %17
%add2.8 = fadd float %add2.7, %add.8
; Element 9.
%arrayidx.9 = getelementptr inbounds float* %data1, i32 9
%18 = load float* %arrayidx.9, align 4
%mul.9 = fmul float %18, %a
%arrayidx1.9 = getelementptr inbounds float* %data2, i32 9
%19 = load float* %arrayidx1.9, align 4
%add.9 = fadd float %mul.9, %19
%add2.9 = fadd float %add2.8, %add.9
; Return the sum accumulated over all ten elements.
ret float %add2.9
}