Improve machine schedulers for in-order processors

This patch enables schedulers to specify instructions that cannot be issued with any other instructions. It also fixes BeginGroup/EndGroup. Reviewed by: Andrew Trick Differential Revision: https://reviews.llvm.org/D30744 llvm-svn: 298885
2024-11-23 03:02:36 +01:00 · 2017-03-27 20:46:37 +00:00 · 2017-03-27 20:46:37 +00:00 · 3e2ba99436
commit 3e2ba99436
parent a7615b5302
7 changed files with 154 additions and 1 deletions
--- a/include/llvm/CodeGen/TargetSchedule.h
+++ b/include/llvm/CodeGen/TargetSchedule.h
@ -91,6 +91,13 @@ public:
  /// \brief Maximum number of micro-ops that may be scheduled per cycle.
  unsigned getIssueWidth() const { return SchedModel.IssueWidth; }

+  /// \brief Return true if new group must begin.
+  bool mustBeginGroup(const MachineInstr *MI,
+                          const MCSchedClassDesc *SC = nullptr) const;
+  /// \brief Return true if current group must end.
+  bool mustEndGroup(const MachineInstr *MI,
+                          const MCSchedClassDesc *SC = nullptr) const;
+
  /// \brief Return the number of issue slots required for this MI.
  unsigned getNumMicroOps(const MachineInstr *MI,
                          const MCSchedClassDesc *SC = nullptr) const;
@ -176,6 +183,7 @@ public:
                               bool UseDefaultDefLatency = true) const;
  unsigned computeInstrLatency(unsigned Opcode) const;

+
  /// \brief Output dependency latency of a pair of defs of the same register.
  ///
  /// This is typically one cycle.
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@ -255,6 +255,9 @@ class ProcWriteResources<list<ProcResourceKind> resources> {
  // Allow a processor to mark some scheduling classes as unsupported
  // for stronger verification.
  bit Unsupported = 0;
+  // Allow a processor to mark some scheduling classes as single-issue.
+  // SingleIssue is an alias for Begin/End Group.
+  bit SingleIssue = 0;
  SchedMachineModel SchedModel = ?;
 }

--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@ -1173,6 +1173,12 @@ void ScheduleDAGMILive::schedule() {
        dbgs() << "  Pressure Diff      : ";
        getPressureDiff(&SU).dump(*TRI);
      }
+      dbgs() << "  Single Issue       : ";
+      if (SchedModel.mustBeginGroup(SU.getInstr()) &&
+         SchedModel.mustEndGroup(SU.getInstr()))
+        dbgs() << "true;";
+      else
+        dbgs() << "false;";
      dbgs() << '\n';
    }
    if (ExitSU.getInstr() != nullptr)
@ -1910,12 +1916,22 @@ bool SchedBoundary::checkHazard(SUnit *SU) {
      && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
    return true;
  }
+
  unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
  if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
    DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") uops="
          << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
    return true;
  }
+
+  if (CurrMOps > 0 &&
+      ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
+       (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
+    DEBUG(dbgs() << "  hazard: SU(" << SU->NodeNum << ") must "
+                 << (isTop()? "begin" : "end") << " group\n");
+    return true;
+  }
+
  if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
    const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
    for (TargetSchedModel::ProcResIter
@ -2211,6 +2227,18 @@ void SchedBoundary::bumpNode(SUnit *SU) {
  // one cycle.  Since we commonly reach the max MOps here, opportunistically
  // bump the cycle to avoid uselessly checking everything in the readyQ.
  CurrMOps += IncMOps;
+
+  // Bump the cycle count for issue group constraints.
+  // This must be done after NextCycle has been adjust for all other stalls.
+  // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set
+  // currCycle to X.
+  if ((isTop() &&  SchedModel->mustEndGroup(SU->getInstr())) ||
+      (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
+    DEBUG(dbgs() << "  Bump cycle to "
+                 << (isTop() ? "end" : "begin") << " group\n");
+    bumpCycle(++NextCycle);
+  }
+
  while (CurrMOps >= SchedModel->getIssueWidth()) {
    DEBUG(dbgs() << "  *** Max MOps " << CurrMOps
          << " at cycle " << CurrCycle << '\n');
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@ -84,6 +84,29 @@ void TargetSchedModel::init(const MCSchedModel &sm,
  }
 }

+/// Returns true only if instruction is specified as single issue.
+bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI,
+                                     const MCSchedClassDesc *SC) const {
+  if (hasInstrSchedModel()) {
+    if (!SC)
+      SC = resolveSchedClass(MI);
+    if (SC->isValid())
+      return SC->BeginGroup;
+  }
+  return false;
+}
+
+bool TargetSchedModel::mustEndGroup(const MachineInstr *MI,
+                                     const MCSchedClassDesc *SC) const {
+  if (hasInstrSchedModel()) {
+    if (!SC)
+      SC = resolveSchedClass(MI);
+    if (SC->isValid())
+      return SC->EndGroup;
+  }
+  return false;
+}
+
 unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
                                          const MCSchedClassDesc *SC) const {
  if (hasInstrItineraries()) {
--- a/lib/Target/ARM/ARMScheduleR52.td
+++ b/lib/Target/ARM/ARMScheduleR52.td
@ -74,7 +74,7 @@ def : WriteRes<WriteCMPsr, [R52UnitALU]> { let Latency = 0; }

 // Div - may stall 0-9 cycles depending on input (i.e. WRI+(0-9)/2)
 def : WriteRes<WriteDIV, [R52UnitDiv]> {
-  let Latency = 8; let ResourceCycles = [8]; // not pipelined
+  let Latency = 8; let ResourceCycles = [8]; // non-pipelined
 }

 // Branches  - LR written in Late EX2
@ -717,16 +717,19 @@ def R52WriteVLD2Mem  : SchedWriteRes<[R52UnitLd]> {
  let Latency = 6;
  let NumMicroOps = 3;
  let ResourceCycles = [2];
+  let SingleIssue = 1;
 }
 def R52WriteVLD3Mem  : SchedWriteRes<[R52UnitLd]> {
  let Latency = 7;
  let NumMicroOps = 5;
  let ResourceCycles = [3];
+  let SingleIssue = 1;
 }
 def R52WriteVLD4Mem  : SchedWriteRes<[R52UnitLd]> {
  let Latency = 8;
  let NumMicroOps = 7;
  let ResourceCycles = [4];
+  let SingleIssue = 1;
 }
 def R52WriteVST1Mem  : SchedWriteRes<[R52UnitLd]> {
  let Latency = 5;
--- a/test/CodeGen/ARM/single-issue-r52.mir
+++ b/test/CodeGen/ARM/single-issue-r52.mir
@ -0,0 +1,86 @@
+# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -debug-only=misched -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN
+# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -debug-only=misched -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP
+# REQUIRES: asserts
+--- |
+  ; ModuleID = 'foo.ll'
+  source_filename = "foo.ll"
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "arm---eabi"
+
+  %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+  ; Function Attrs: nounwind
+  define <8 x i8> @foo(i8* %A) {
+    %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8* %A, i32 8)
+    %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
+    %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 1
+    %tmp4 = add <8 x i8> %tmp2, %tmp3
+    ret <8 x i8> %tmp4
+  }
+  declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8*, i32)
+
+# CHECK: ********** MI Scheduling **********
+# CHECK: ScheduleDAGMILive::schedule starting
+# CHECK: SU(1):   %vreg1<def> = VLD4d8Pseudo %vreg0, 8, pred:14, pred:%noreg; mem:LD32[%A](align=8) QQPR:%vreg1 GPR:%vreg0
+# CHECK: Latency            : 8
+# CHECK: Single Issue       : true;
+# CHECK: SU(2):   %vreg4<def> = VADDv8i8 %vreg1:dsub_0, %vreg1:dsub_1, pred:14, pred:%noreg; DPR:%vreg4 QQPR:%vreg1
+# CHECK: Latency            : 5
+# CHECK: Single Issue       : false;
+# CHECK: SU(3):   %vreg5<def>, %vreg6<def> = VMOVRRD %vreg4, pred:14, pred:%noreg; GPR:%vreg5,%vreg6 DPR:%vreg4
+# CHECK: Latency            : 4
+# CHECK: Single Issue       : false;
+
+# TOPDOWN: Scheduling SU(1) %vreg1<def> = VLD4d8Pseudo
+# TOPDOWN: Bump cycle to end group
+# TOPDOWN: Scheduling SU(2) %vreg4<def> = VADDv8i8
+
+# BOTTOMUP: Scheduling SU(2) %vreg4<def> = VADDv8i8
+# BOTTOMUP: Scheduling SU(1) %vreg1<def> = VLD4d8Pseudo
+# BOTTOMUP: Bump cycle to begin group
+
+...
+---
+name:            foo
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: qqpr }
+  - { id: 2, class: dpr }
+  - { id: 3, class: dpr }
+  - { id: 4, class: dpr }
+  - { id: 5, class: gpr }
+  - { id: 6, class: gpr }
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %r0
+
+    %0 = COPY %r0
+    %1 = VLD4d8Pseudo %0, 8, 14, _ :: (load 32 from %ir.A, align 8)
+    %4 = VADDv8i8 %1.dsub_0, %1.dsub_1, 14, _
+    %5, %6 = VMOVRRD %4, 14, _
+    %r0 = COPY %5
+    %r1 = COPY %6
+    BX_RET 14, _, implicit %r0, implicit killed %r1
+
+...
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@ -917,6 +917,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
        SCDesc.NumMicroOps += WriteRes->getValueAsInt("NumMicroOps");
        SCDesc.BeginGroup |= WriteRes->getValueAsBit("BeginGroup");
        SCDesc.EndGroup |= WriteRes->getValueAsBit("EndGroup");
+        SCDesc.BeginGroup |= WriteRes->getValueAsBit("SingleIssue");
+        SCDesc.EndGroup |= WriteRes->getValueAsBit("SingleIssue");

        // Create an entry for each ProcResource listed in WriteRes.
        RecVec PRVec = WriteRes->getValueAsListOfDefs("ProcResources");