1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 12:41:49 +01:00

[MCA] Support carry-over instructions for in-order processors

Instructions that have more uops than the processor's IssueWidth are
issued in multiple cycles.

The patch fixes PR49712.

Differential Revision: https://reviews.llvm.org/D99339
This commit is contained in:
Andrew Savonichev 2021-03-24 23:33:21 +03:00
parent 124a82a5ee
commit e73adcc6fa
4 changed files with 151 additions and 20 deletions

View File

@ -45,6 +45,11 @@ class InOrderIssueStage final : public Stage {
InstRef StalledInst;
unsigned StallCyclesLeft;
/// Instruction that is issued in more than 1 cycle.
InstRef CarriedOver;
/// Number of CarriedOver uops left to issue.
unsigned CarryOver;
/// Number of instructions that can be issued in the current cycle.
unsigned Bandwidth;
@ -67,6 +72,9 @@ class InOrderIssueStage final : public Stage {
/// Update status of instructions from IssuedInst.
void updateIssuedInst();
/// Continue to issue the CarriedOver instruction.
void updateCarriedOver();
/// Retire instruction once it is executed.
void retireInstruction(InstRef &IR);
@ -74,7 +82,8 @@ public:
InOrderIssueStage(RegisterFile &PRF, const MCSchedModel &SM,
const MCSubtargetInfo &STI)
: SM(SM), STI(STI), PRF(PRF), RM(std::make_unique<ResourceManager>(SM)),
NumIssued(0), StallCyclesLeft(0), Bandwidth(0), LastWriteBackCycle(0) {}
NumIssued(0), StallCyclesLeft(0), CarryOver(0), Bandwidth(0),
LastWriteBackCycle(0) {}
bool isAvailable(const InstRef &) const override;
bool hasWorkToComplete() const override;

View File

@ -29,15 +29,19 @@ namespace llvm {
namespace mca {
bool InOrderIssueStage::hasWorkToComplete() const {
return !IssuedInst.empty() || StalledInst;
return !IssuedInst.empty() || StalledInst || CarriedOver;
}
bool InOrderIssueStage::isAvailable(const InstRef &IR) const {
if (StalledInst || CarriedOver)
return false;
const Instruction &Inst = *IR.getInstruction();
unsigned NumMicroOps = Inst.getNumMicroOps();
const InstrDesc &Desc = Inst.getDesc();
if (Bandwidth < NumMicroOps)
bool ShouldCarryOver = NumMicroOps > SM.IssueWidth;
if (Bandwidth < NumMicroOps && !ShouldCarryOver)
return false;
// Instruction with BeginGroup must be the first instruction to be issued in a
@ -247,15 +251,19 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) {
}
notifyInstructionIssue(IR, UsedResources, *this);
if (Desc.EndGroup) {
bool ShouldCarryOver = NumMicroOps > Bandwidth;
if (ShouldCarryOver) {
CarryOver = NumMicroOps - Bandwidth;
CarriedOver = IR;
Bandwidth = 0;
NumIssued += Bandwidth;
LLVM_DEBUG(dbgs() << "[N] Carry over #" << IR << " \n");
} else {
assert(Bandwidth >= NumMicroOps);
Bandwidth -= NumMicroOps;
NumIssued += NumMicroOps;
Bandwidth = Desc.EndGroup ? 0 : Bandwidth - NumMicroOps;
}
IssuedInst.push_back(IR);
NumIssued += NumMicroOps;
if (!IR.getInstruction()->getDesc().RetireOOO)
LastWriteBackCycle = findLastWriteBackCycle(IR);
@ -295,6 +303,32 @@ void InOrderIssueStage::updateIssuedInst() {
IssuedInst.resize(IssuedInst.size() - NumExecuted);
}
/// Continue issuing the instruction held in CarriedOver, spending the
/// Bandwidth of the current cycle on its remaining CarryOver micro-ops.
///
/// If the remaining micro-ops exceed the available Bandwidth, all of the
/// Bandwidth is consumed and the instruction stays carried over. Otherwise
/// the carry-over state is cleared and Bandwidth is reduced accordingly
/// (or dropped to zero when the instruction is marked EndGroup).
void InOrderIssueStage::updateCarriedOver() {
  // Nothing was carried over from an earlier cycle.
  if (!CarriedOver)
    return;

  assert(!StalledInst && "A stalled instruction cannot be carried over.");

  const bool FitsInThisCycle = CarryOver <= Bandwidth;
  if (!FitsInThisCycle) {
    // The remaining micro-ops use up this whole cycle; keep carrying over.
    CarryOver -= Bandwidth;
    Bandwidth = 0;
    LLVM_DEBUG(dbgs() << "[N] Carry over (" << CarryOver << "uops left) #"
                      << CarriedOver << " \n");
    return;
  }

  LLVM_DEBUG(dbgs() << "[N] Carry over (complete) #" << CarriedOver
                    << " \n");

  // An EndGroup instruction leaves no bandwidth for the rest of the cycle.
  const bool EndsGroup = CarriedOver.getInstruction()->getDesc().EndGroup;
  Bandwidth = EndsGroup ? 0 : Bandwidth - CarryOver;

  // The carried-over instruction is now fully issued; reset the state.
  CarryOver = 0;
  CarriedOver = InstRef();
}
void InOrderIssueStage::retireInstruction(InstRef &IR) {
Instruction &IS = *IR.getInstruction();
IS.retire();
@ -319,6 +353,9 @@ llvm::Error InOrderIssueStage::cycleStart() {
updateIssuedInst();
// Continue to issue the instruction carried over from the previous cycle
updateCarriedOver();
// Issue instructions scheduled for this cycle
if (!StallCyclesLeft && StalledInst) {
if (llvm::Error E = tryIssue(StalledInst, &StallCyclesLeft))

View File

@ -0,0 +1,83 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a53 --timeline --iterations=1 < %s | FileCheck %s
ldp w3, w5, [x10], #4 // 2uop + 1uop carry over
add w10, w11, w12
add w13, w14, w15
ldp w7, w8, [x11] // 2uop, no carry over
add w16, w17, w18
add w19, w20, w21
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 6
# CHECK-NEXT: Total Cycles: 8
# CHECK-NEXT: Total uOps: 9
# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 1.13
# CHECK-NEXT: IPC: 0.75
# CHECK-NEXT: Block RThroughput: 4.5
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 3 4 2.00 * ldp w3, w5, [x10], #4
# CHECK-NEXT: 1 3 0.50 add w10, w11, w12
# CHECK-NEXT: 1 3 0.50 add w13, w14, w15
# CHECK-NEXT: 2 4 2.00 * ldp w7, w8, [x11]
# CHECK-NEXT: 1 3 0.50 add w16, w17, w18
# CHECK-NEXT: 1 3 0.50 add w19, w20, w21
# CHECK: Resources:
# CHECK-NEXT: [0.0] - A53UnitALU
# CHECK-NEXT: [0.1] - A53UnitALU
# CHECK-NEXT: [1] - A53UnitB
# CHECK-NEXT: [2] - A53UnitDiv
# CHECK-NEXT: [3] - A53UnitFPALU
# CHECK-NEXT: [4] - A53UnitFPMDS
# CHECK-NEXT: [5] - A53UnitLdSt
# CHECK-NEXT: [6] - A53UnitMAC
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6]
# CHECK-NEXT: 2.00 2.00 - - - - 4.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: - - - - - - 2.00 - ldp w3, w5, [x10], #4
# CHECK-NEXT: - 1.00 - - - - - - add w10, w11, w12
# CHECK-NEXT: 1.00 - - - - - - - add w13, w14, w15
# CHECK-NEXT: - - - - - - 2.00 - ldp w7, w8, [x11]
# CHECK-NEXT: - 1.00 - - - - - - add w16, w17, w18
# CHECK-NEXT: 1.00 - - - - - - - add w19, w20, w21
# CHECK: Timeline view:
# CHECK-NEXT: Index 01234567
# CHECK: [0,0] DeeeE. . ldp w3, w5, [x10], #4
# CHECK-NEXT: [0,1] .DeeE. . add w10, w11, w12
# CHECK-NEXT: [0,2] . DeeE . add w13, w14, w15
# CHECK-NEXT: [0,3] . DeeeE ldp w7, w8, [x11]
# CHECK-NEXT: [0,4] . DeeE add w16, w17, w18
# CHECK-NEXT: [0,5] . DeeE add w19, w20, w21
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 0.0 0.0 0.0 ldp w3, w5, [x10], #4
# CHECK-NEXT: 1. 1 0.0 0.0 0.0 add w10, w11, w12
# CHECK-NEXT: 2. 1 0.0 0.0 0.0 add w13, w14, w15
# CHECK-NEXT: 3. 1 0.0 0.0 0.0 ldp w7, w8, [x11]
# CHECK-NEXT: 4. 1 0.0 0.0 0.0 add w16, w17, w18
# CHECK-NEXT: 5. 1 0.0 0.0 0.0 add w19, w20, w21
# CHECK-NEXT: 1 0.0 0.0 0.0 <total>

View File

@ -28,8 +28,7 @@ v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
v_ldexp_f64 v[2:3], v[2:3], v0
; FIXME: This instructions sends llvm-mca into an infinite loop
;v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
v_trig_preop_f64 v[2:3], v[2:3], v0
@ -41,14 +40,14 @@ v_rsq_f64 v[2:3], v[2:3]
v_sqrt_f64 v[4:5], v[4:5]
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 27
# CHECK-NEXT: Total Cycles: 204
# CHECK-NEXT: Total uOps: 27
# CHECK-NEXT: Instructions: 28
# CHECK-NEXT: Total Cycles: 224
# CHECK-NEXT: Total uOps: 29
# CHECK: Dispatch Width: 1
# CHECK-NEXT: uOps Per Cycle: 0.13
# CHECK-NEXT: IPC: 0.13
# CHECK-NEXT: Block RThroughput: 27.0
# CHECK-NEXT: Block RThroughput: 29.0
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
@ -80,6 +79,7 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK-NEXT: 1 22 1.00 U v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 1 22 1.00 U v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 1 22 1.00 U v_ldexp_f64 v[2:3], v[2:3], v0
# CHECK-NEXT: 2 22 2.00 U v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 1 22 1.00 U v_trig_preop_f64 v[2:3], v[2:3], v0
# CHECK-NEXT: 1 22 1.00 U v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
# CHECK-NEXT: 1 22 1.00 U v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
@ -98,7 +98,7 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6]
# CHECK-NEXT: - - - 27.00 - 27.00 -
# CHECK-NEXT: - - - 29.00 1.00 28.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions:
@ -123,6 +123,7 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_ldexp_f64 v[2:3], v[2:3], v0
# CHECK-NEXT: - - - 2.00 1.00 1.00 - v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_trig_preop_f64 v[2:3], v[2:3], v0
# CHECK-NEXT: - - - 1.00 - 1.00 - v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
# CHECK-NEXT: - - - 1.00 - 1.00 - v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
@ -176,10 +177,11 @@ v_sqrt_f64 v[4:5], v[4:5]
# CHECK-NEXT: 18. 1 0.0 0.0 0.0 v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 19. 1 0.0 0.0 0.0 v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 20. 1 0.0 0.0 0.0 v_ldexp_f64 v[2:3], v[2:3], v0
# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_trig_preop_f64 v[2:3], v[2:3], v0
# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_rcp_f64_e32 v[0:1], v[0:1]
# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_rsq_f64_e32 v[2:3], v[2:3]
# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_sqrt_f64_e32 v[4:5], v[4:5]
# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_trig_preop_f64 v[2:3], v[2:3], v0
# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_rcp_f64_e32 v[0:1], v[0:1]
# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_rsq_f64_e32 v[2:3], v[2:3]
# CHECK-NEXT: 27. 1 0.0 0.0 0.0 v_sqrt_f64_e32 v[4:5], v[4:5]
# CHECK-NEXT: 1 0.0 0.0 0.0 <total>