mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[MCA] Support carry-over instructions for in-order processors
Instructions that have more uops than the processor's IssueWidth are issued in multiple cycles. The patch fixes PR49712. Differential Revision: https://reviews.llvm.org/D99339
This commit is contained in:
parent
124a82a5ee
commit
e73adcc6fa
@ -45,6 +45,11 @@ class InOrderIssueStage final : public Stage {
|
||||
InstRef StalledInst;
|
||||
unsigned StallCyclesLeft;
|
||||
|
||||
/// Instruction that is issued in more than 1 cycle.
|
||||
InstRef CarriedOver;
|
||||
/// Number of CarriedOver uops left to issue.
|
||||
unsigned CarryOver;
|
||||
|
||||
/// Number of instructions that can be issued in the current cycle.
|
||||
unsigned Bandwidth;
|
||||
|
||||
@ -67,6 +72,9 @@ class InOrderIssueStage final : public Stage {
|
||||
/// Update status of instructions from IssuedInst.
|
||||
void updateIssuedInst();
|
||||
|
||||
/// Continue to issue the CarriedOver instruction.
|
||||
void updateCarriedOver();
|
||||
|
||||
/// Retire instruction once it is executed.
|
||||
void retireInstruction(InstRef &IR);
|
||||
|
||||
@ -74,7 +82,8 @@ public:
|
||||
InOrderIssueStage(RegisterFile &PRF, const MCSchedModel &SM,
|
||||
const MCSubtargetInfo &STI)
|
||||
: SM(SM), STI(STI), PRF(PRF), RM(std::make_unique<ResourceManager>(SM)),
|
||||
NumIssued(0), StallCyclesLeft(0), Bandwidth(0), LastWriteBackCycle(0) {}
|
||||
NumIssued(0), StallCyclesLeft(0), CarryOver(0), Bandwidth(0),
|
||||
LastWriteBackCycle(0) {}
|
||||
|
||||
bool isAvailable(const InstRef &) const override;
|
||||
bool hasWorkToComplete() const override;
|
||||
|
@ -29,15 +29,19 @@ namespace llvm {
|
||||
namespace mca {
|
||||
|
||||
// Reports whether this stage still has pending work.
// NOTE(review): this is a rendered diff hunk, so two return statements appear
// back to back. Presumably the first is the pre-patch row and the second,
// which additionally checks CarriedOver, is its replacement -- confirm against
// the actual source file.
bool InOrderIssueStage::hasWorkToComplete() const {
|
||||
return !IssuedInst.empty() || StalledInst;
|
||||
return !IssuedInst.empty() || StalledInst || CarriedOver;
|
||||
}
|
||||
|
||||
bool InOrderIssueStage::isAvailable(const InstRef &IR) const {
|
||||
if (StalledInst || CarriedOver)
|
||||
return false;
|
||||
|
||||
const Instruction &Inst = *IR.getInstruction();
|
||||
unsigned NumMicroOps = Inst.getNumMicroOps();
|
||||
const InstrDesc &Desc = Inst.getDesc();
|
||||
|
||||
if (Bandwidth < NumMicroOps)
|
||||
bool ShouldCarryOver = NumMicroOps > SM.IssueWidth;
|
||||
if (Bandwidth < NumMicroOps && !ShouldCarryOver)
|
||||
return false;
|
||||
|
||||
// Instruction with BeginGroup must be the first instruction to be issued in a
|
||||
@ -247,15 +251,19 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) {
|
||||
}
|
||||
notifyInstructionIssue(IR, UsedResources, *this);
|
||||
|
||||
if (Desc.EndGroup) {
|
||||
bool ShouldCarryOver = NumMicroOps > Bandwidth;
|
||||
if (ShouldCarryOver) {
|
||||
CarryOver = NumMicroOps - Bandwidth;
|
||||
CarriedOver = IR;
|
||||
Bandwidth = 0;
|
||||
NumIssued += Bandwidth;
|
||||
LLVM_DEBUG(dbgs() << "[N] Carry over #" << IR << " \n");
|
||||
} else {
|
||||
assert(Bandwidth >= NumMicroOps);
|
||||
Bandwidth -= NumMicroOps;
|
||||
NumIssued += NumMicroOps;
|
||||
Bandwidth = Desc.EndGroup ? 0 : Bandwidth - NumMicroOps;
|
||||
}
|
||||
|
||||
IssuedInst.push_back(IR);
|
||||
NumIssued += NumMicroOps;
|
||||
|
||||
if (!IR.getInstruction()->getDesc().RetireOOO)
|
||||
LastWriteBackCycle = findLastWriteBackCycle(IR);
|
||||
@ -295,6 +303,32 @@ void InOrderIssueStage::updateIssuedInst() {
|
||||
IssuedInst.resize(IssuedInst.size() - NumExecuted);
|
||||
}
|
||||
|
||||
// Continues issuing the instruction held in CarriedOver by charging its
// remaining CarryOver uops against Bandwidth. Does nothing when no
// instruction is currently carried over.
void InOrderIssueStage::updateCarriedOver() {
|
||||
if (!CarriedOver)
|
||||
return;
|
||||
|
||||
assert(!StalledInst && "A stalled instruction cannot be carried over.");
|
||||
|
||||
// More uops remain than the available bandwidth: consume all of the
// bandwidth and leave the instruction carried over.
if (CarryOver > Bandwidth) {
|
||||
CarryOver -= Bandwidth;
|
||||
Bandwidth = 0;
|
||||
LLVM_DEBUG(dbgs() << "[N] Carry over (" << CarryOver << "uops left) #"
|
||||
<< CarriedOver << " \n");
|
||||
return;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(dbgs() << "[N] Carry over (complete) #" << CarriedOver
|
||||
<< " \n");
|
||||
|
||||
// The remaining uops fit. An EndGroup instruction zeroes the leftover
// bandwidth; otherwise only the uops just accounted for are deducted.
if (CarriedOver.getInstruction()->getDesc().EndGroup)
|
||||
Bandwidth = 0;
|
||||
else
|
||||
Bandwidth -= CarryOver;
|
||||
|
||||
// Clear the carry-over state.
CarriedOver = InstRef();
|
||||
CarryOver = 0;
|
||||
}
|
||||
|
||||
void InOrderIssueStage::retireInstruction(InstRef &IR) {
|
||||
Instruction &IS = *IR.getInstruction();
|
||||
IS.retire();
|
||||
@ -319,6 +353,9 @@ llvm::Error InOrderIssueStage::cycleStart() {
|
||||
|
||||
updateIssuedInst();
|
||||
|
||||
// Continue to issue the instruction carried over from the previous cycle
|
||||
updateCarriedOver();
|
||||
|
||||
// Issue instructions scheduled for this cycle
|
||||
if (!StallCyclesLeft && StalledInst) {
|
||||
if (llvm::Error E = tryIssue(StalledInst, &StallCyclesLeft))
|
||||
|
83
test/tools/llvm-mca/AArch64/Cortex/A53-carry-over.s
Normal file
83
test/tools/llvm-mca/AArch64/Cortex/A53-carry-over.s
Normal file
@ -0,0 +1,83 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
|
||||
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a53 --timeline --iterations=1 < %s | FileCheck %s
|
||||
|
||||
ldp w3, w5, [x10], #4 // 2uop + 1uop carry over
|
||||
add w10, w11, w12
|
||||
add w13, w14, w15
|
||||
ldp w7, w8, [x11] // 2uop, no carry over
|
||||
add w16, w17, w18
|
||||
add w19, w20, w21
|
||||
|
||||
# CHECK: Iterations: 1
|
||||
# CHECK-NEXT: Instructions: 6
|
||||
# CHECK-NEXT: Total Cycles: 8
|
||||
# CHECK-NEXT: Total uOps: 9
|
||||
|
||||
# CHECK: Dispatch Width: 2
|
||||
# CHECK-NEXT: uOps Per Cycle: 1.13
|
||||
# CHECK-NEXT: IPC: 0.75
|
||||
# CHECK-NEXT: Block RThroughput: 4.5
|
||||
|
||||
# CHECK: Instruction Info:
|
||||
# CHECK-NEXT: [1]: #uOps
|
||||
# CHECK-NEXT: [2]: Latency
|
||||
# CHECK-NEXT: [3]: RThroughput
|
||||
# CHECK-NEXT: [4]: MayLoad
|
||||
# CHECK-NEXT: [5]: MayStore
|
||||
# CHECK-NEXT: [6]: HasSideEffects (U)
|
||||
|
||||
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
|
||||
# CHECK-NEXT: 3 4 2.00 * ldp w3, w5, [x10], #4
|
||||
# CHECK-NEXT: 1 3 0.50 add w10, w11, w12
|
||||
# CHECK-NEXT: 1 3 0.50 add w13, w14, w15
|
||||
# CHECK-NEXT: 2 4 2.00 * ldp w7, w8, [x11]
|
||||
# CHECK-NEXT: 1 3 0.50 add w16, w17, w18
|
||||
# CHECK-NEXT: 1 3 0.50 add w19, w20, w21
|
||||
|
||||
# CHECK: Resources:
|
||||
# CHECK-NEXT: [0.0] - A53UnitALU
|
||||
# CHECK-NEXT: [0.1] - A53UnitALU
|
||||
# CHECK-NEXT: [1] - A53UnitB
|
||||
# CHECK-NEXT: [2] - A53UnitDiv
|
||||
# CHECK-NEXT: [3] - A53UnitFPALU
|
||||
# CHECK-NEXT: [4] - A53UnitFPMDS
|
||||
# CHECK-NEXT: [5] - A53UnitLdSt
|
||||
# CHECK-NEXT: [6] - A53UnitMAC
|
||||
|
||||
# CHECK: Resource pressure per iteration:
|
||||
# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6]
|
||||
# CHECK-NEXT: 2.00 2.00 - - - - 4.00 -
|
||||
|
||||
# CHECK: Resource pressure by instruction:
|
||||
# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] Instructions:
|
||||
# CHECK-NEXT: - - - - - - 2.00 - ldp w3, w5, [x10], #4
|
||||
# CHECK-NEXT: - 1.00 - - - - - - add w10, w11, w12
|
||||
# CHECK-NEXT: 1.00 - - - - - - - add w13, w14, w15
|
||||
# CHECK-NEXT: - - - - - - 2.00 - ldp w7, w8, [x11]
|
||||
# CHECK-NEXT: - 1.00 - - - - - - add w16, w17, w18
|
||||
# CHECK-NEXT: 1.00 - - - - - - - add w19, w20, w21
|
||||
|
||||
# CHECK: Timeline view:
|
||||
# CHECK-NEXT: Index 01234567
|
||||
|
||||
# CHECK: [0,0] DeeeE. . ldp w3, w5, [x10], #4
|
||||
# CHECK-NEXT: [0,1] .DeeE. . add w10, w11, w12
|
||||
# CHECK-NEXT: [0,2] . DeeE . add w13, w14, w15
|
||||
# CHECK-NEXT: [0,3] . DeeeE ldp w7, w8, [x11]
|
||||
# CHECK-NEXT: [0,4] . DeeE add w16, w17, w18
|
||||
# CHECK-NEXT: [0,5] . DeeE add w19, w20, w21
|
||||
|
||||
# CHECK: Average Wait times (based on the timeline view):
|
||||
# CHECK-NEXT: [0]: Executions
|
||||
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
|
||||
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
|
||||
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
|
||||
|
||||
# CHECK: [0] [1] [2] [3]
|
||||
# CHECK-NEXT: 0. 1 0.0 0.0 0.0 ldp w3, w5, [x10], #4
|
||||
# CHECK-NEXT: 1. 1 0.0 0.0 0.0 add w10, w11, w12
|
||||
# CHECK-NEXT: 2. 1 0.0 0.0 0.0 add w13, w14, w15
|
||||
# CHECK-NEXT: 3. 1 0.0 0.0 0.0 ldp w7, w8, [x11]
|
||||
# CHECK-NEXT: 4. 1 0.0 0.0 0.0 add w16, w17, w18
|
||||
# CHECK-NEXT: 5. 1 0.0 0.0 0.0 add w19, w20, w21
|
||||
# CHECK-NEXT: 1 0.0 0.0 0.0 <total>
|
@ -28,8 +28,7 @@ v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
v_ldexp_f64 v[2:3], v[2:3], v0
|
||||
|
||||
; FIXME: This instructions sends llvm-mca into an infinite loop
|
||||
;v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
|
||||
v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
|
||||
|
||||
v_trig_preop_f64 v[2:3], v[2:3], v0
|
||||
|
||||
@ -41,14 +40,14 @@ v_rsq_f64 v[2:3], v[2:3]
|
||||
v_sqrt_f64 v[4:5], v[4:5]
|
||||
|
||||
# CHECK: Iterations: 1
|
||||
# CHECK-NEXT: Instructions: 27
|
||||
# CHECK-NEXT: Total Cycles: 204
|
||||
# CHECK-NEXT: Total uOps: 27
|
||||
# CHECK-NEXT: Instructions: 28
|
||||
# CHECK-NEXT: Total Cycles: 224
|
||||
# CHECK-NEXT: Total uOps: 29
|
||||
|
||||
# CHECK: Dispatch Width: 1
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.13
|
||||
# CHECK-NEXT: IPC: 0.13
|
||||
# CHECK-NEXT: Block RThroughput: 27.0
|
||||
# CHECK-NEXT: Block RThroughput: 29.0
|
||||
|
||||
# CHECK: Instruction Info:
|
||||
# CHECK-NEXT: [1]: #uOps
|
||||
@ -80,6 +79,7 @@ v_sqrt_f64 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: 1 22 1.00 U v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 1 22 1.00 U v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 1 22 1.00 U v_ldexp_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: 2 22 2.00 U v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 1 22 1.00 U v_trig_preop_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: 1 22 1.00 U v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 1 22 1.00 U v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
|
||||
@ -98,7 +98,7 @@ v_sqrt_f64 v[4:5], v[4:5]
|
||||
|
||||
# CHECK: Resource pressure per iteration:
|
||||
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6]
|
||||
# CHECK-NEXT: - - - 27.00 - 27.00 -
|
||||
# CHECK-NEXT: - - - 29.00 1.00 28.00 -
|
||||
|
||||
# CHECK: Resource pressure by instruction:
|
||||
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] Instructions:
|
||||
@ -123,6 +123,7 @@ v_sqrt_f64 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: - - - 1.00 - 1.00 - v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: - - - 1.00 - 1.00 - v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: - - - 1.00 - 1.00 - v_ldexp_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: - - - 2.00 1.00 1.00 - v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: - - - 1.00 - 1.00 - v_trig_preop_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: - - - 1.00 - 1.00 - v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
|
||||
# CHECK-NEXT: - - - 1.00 - 1.00 - v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
|
||||
@ -176,10 +177,11 @@ v_sqrt_f64 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: 18. 1 0.0 0.0 0.0 v_div_fmas_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 19. 1 0.0 0.0 0.0 v_div_fixup_f64 v[0:1], v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 20. 1 0.0 0.0 0.0 v_ldexp_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_trig_preop_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
|
||||
# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_rcp_f64_e32 v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_rsq_f64_e32 v[2:3], v[2:3]
|
||||
# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_sqrt_f64_e32 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: 21. 1 0.0 0.0 0.0 v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 22. 1 0.0 0.0 0.0 v_trig_preop_f64 v[2:3], v[2:3], v0
|
||||
# CHECK-NEXT: 23. 1 0.0 0.0 0.0 v_cmp_eq_f64_e32 vcc_lo, v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 24. 1 0.0 0.0 0.0 v_cmp_class_f64_e64 vcc_lo, v[2:3], s0
|
||||
# CHECK-NEXT: 25. 1 0.0 0.0 0.0 v_rcp_f64_e32 v[0:1], v[0:1]
|
||||
# CHECK-NEXT: 26. 1 0.0 0.0 0.0 v_rsq_f64_e32 v[2:3], v[2:3]
|
||||
# CHECK-NEXT: 27. 1 0.0 0.0 0.0 v_sqrt_f64_e32 v[4:5], v[4:5]
|
||||
# CHECK-NEXT: 1 0.0 0.0 0.0 <total>
|
||||
|
Loading…
x
Reference in New Issue
Block a user