[ModuloSchedule] Fix modulo expansion for data loop carried dependencies.

The new experimental expansion has a problem when a value has a data dependency with an instruction from a previous stage. This is due to the way we peel out the kernel. To fix that I'm changing the way we peel out the kernel. We now peel the kernel NumberStage - 1 times. The code would be correct at this point if we didn't have to handle cases where the loop iteration is smaller than the number of stages. To handle this case we move instructions between different epilogues based on their stage and remap the PHI instructions correctly. Differential Revision: https://reviews.llvm.org/D69538
2024-10-20 19:42:54 +02:00 · 2019-11-11 11:25:19 -08:00 · 2019-11-11 11:25:19 -08:00 · fd6dc2b3dc
commit fd6dc2b3dc
parent 96dab2e747
5 changed files with 195 additions and 21 deletions
--- a/include/llvm/CodeGen/ModuloSchedule.h
+++ b/include/llvm/CodeGen/ModuloSchedule.h
@ -299,6 +299,8 @@ class PeelingModuloScheduleExpander {

  /// State passed from peelKernel to peelPrologAndEpilogs().
  std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
+  /// Illegal phis that need to be deleted once we re-link stages.
+  SmallVector<MachineInstr *, 4> IllegalPhisToDelete;

 public:
  PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
@ -321,6 +323,13 @@ private:
  /// Peels one iteration of the rewritten kernel (BB) in the specified
  /// direction.
  MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
+  // Delete instructions whose stage is less than MinStage in the given basic
+  // block.
+  void filterInstructions(MachineBasicBlock *MB, int MinStage);
+  // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
+  // instructions to keep a valid IR.
+  void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
+                              MachineBasicBlock *SourceBB, unsigned Stage);
  /// Peel the kernel forwards and backwards to produce prologs and epilogs,
  /// and stitch them together.
  void peelPrologAndEpilogs();
--- a/lib/CodeGen/ModuloSchedule.cpp
+++ b/lib/CodeGen/ModuloSchedule.cpp
@ -1582,6 +1582,99 @@ PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
  return NewBB;
 }

+void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
+                                                       int MinStage) {
+  for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
+       I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
+    MachineInstr *MI = &*I++;
+    int Stage = getStage(MI);
+    if (Stage == -1 || Stage >= MinStage)
+      continue;
+
+    for (MachineOperand &DefMO : MI->defs()) {
+      SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
+      for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
+        // Only PHIs can use values from this block by construction.
+        // Match with the equivalent PHI in B.
+        assert(UseMI.isPHI());
+        Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
+                                               MI->getParent());
+        Subs.emplace_back(&UseMI, Reg);
+      }
+      for (auto &Sub : Subs)
+        Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
+                                      *MRI.getTargetRegisterInfo());
+    }
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(*MI);
+    MI->eraseFromParent();
+  }
+}
+
+void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
+    MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
+  auto InsertPt = DestBB->getFirstNonPHI();
+  DenseMap<Register, Register> Remaps;
+  for (auto I = SourceBB->getFirstNonPHI(); I != SourceBB->end();) {
+    MachineInstr *MI = &*I++;
+    if (MI->isPHI()) {
+      // This is an illegal PHI. If we move any instructions using an illegal
+      // PHI, we need to create a legal Phi
+      Register PhiR = MI->getOperand(0).getReg();
+      auto RC = MRI.getRegClass(PhiR);
+      Register NR = MRI.createVirtualRegister(RC);
+      MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(), DebugLoc(),
+                                 TII->get(TargetOpcode::PHI), NR)
+                             .addReg(PhiR)
+                             .addMBB(SourceBB);
+      BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
+      CanonicalMIs[NI] = CanonicalMIs[MI];
+      Remaps[PhiR] = NR;
+      continue;
+    }
+    if (getStage(MI) != Stage)
+      continue;
+    MI->removeFromParent();
+    DestBB->insert(InsertPt, MI);
+    auto *KernelMI = CanonicalMIs[MI];
+    BlockMIs[{DestBB, KernelMI}] = MI;
+    BlockMIs.erase({SourceBB, KernelMI});
+  }
+  SmallVector<MachineInstr *, 4> PhiToDelete;
+  for (MachineInstr &MI : DestBB->phis()) {
+    assert(MI.getNumOperands() == 3);
+    MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
+    // If the instruction referenced by the phi is moved inside the block
+    // we don't need the phi anymore.
+    if (getStage(Def) == Stage) {
+      Register PhiReg = MI.getOperand(0).getReg();
+      MRI.replaceRegWith(MI.getOperand(0).getReg(),
+                         Def->getOperand(0).getReg());
+      MI.getOperand(0).setReg(PhiReg);
+      PhiToDelete.push_back(&MI);
+    }
+  }
+  for (auto *P : PhiToDelete)
+    P->eraseFromParent();
+  InsertPt = DestBB->getFirstNonPHI();
+  for (MachineInstr &MI : SourceBB->phis()) {
+    MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
+    DestBB->insert(InsertPt, NewMI);
+    Register OrigR = MI.getOperand(0).getReg();
+    Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
+    NewMI->getOperand(0).setReg(R);
+    NewMI->getOperand(1).setReg(OrigR);
+    NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
+    Remaps[OrigR] = R;
+    CanonicalMIs[NewMI] = CanonicalMIs[&MI];
+    BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NewMI;
+  }
+  for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I)
+    for (MachineOperand &MO : I->uses())
+      if (MO.isReg() && Remaps.count(MO.getReg()))
+        MO.setReg(Remaps[MO.getReg()]);
+}
+
 void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
  BitVector LS(Schedule.getNumStages(), true);
  BitVector AS(Schedule.getNumStages(), true);
@ -1607,23 +1700,36 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {

  // Push out the epilogs, again in reverse order.
  // We can't assume anything about the minumum loop trip count at this point,
-  // so emit a fairly complex epilog:
-  //  K[0, 1, 2]     // Kernel runs stages 0, 1, 2
-  //  E0[2] <- P1    // Epilog runs stage 2 only, so the state after is [0].
-  //  E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2.
-  //
-  // This creates a single-successor single-predecessor sequence of blocks for
-  // each epilog, which are kept this way for simplicity at this stage and
-  // cleaned up by the optimizer later.
+  // so emit a fairly complex epilog.
+
+  // We first peel number of stages minus one epilogue. Then we remove dead
+  // stages and reorder instructions based on their stage. If we have 3 stages
+  // we generate first:
+  // E0[3, 2, 1]
+  // E1[3', 2']
+  // E2[3'']
+  // And then we move instructions based on their stages to have:
+  // E0[3]
+  // E1[2, 3']
+  // E2[1, 2', 3'']
+  // The transformation is legal because we only move instructions past
+  // instructions of a previous loop iteration.
  for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
-    Epilogs.push_back(nullptr);
-    for (int J = Schedule.getNumStages() - 1; J >= I; --J) {
-      LS.reset();
-      LS[J] = 1;
-      Epilogs.back() = peelKernel(LPD_Back);
-      LiveStages[Epilogs.back()] = LS;
-      AvailableStages[Epilogs.back()] = AS;
+    Epilogs.push_back(peelKernel(LPD_Back));
+    filterInstructions(Epilogs.back(), Schedule.getNumStages() - I);
+  }
+  for (size_t I = 0; I < Epilogs.size(); I++) {
+    LS.reset();
+    for (size_t J = I; J < Epilogs.size(); J++) {
+      int Iteration = J;
+      unsigned Stage = Schedule.getNumStages() - 1 + I - J;
+      // Move stage one block at a time so that Phi nodes are updated correctly.
+      for (size_t K = Iteration; K > I; K--)
+        moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
+      LS[Stage] = 1;
    }
+    LiveStages[Epilogs[I]] = LS;
+    AvailableStages[Epilogs[I]] = AS;
  }

  // Now we've defined all the prolog and epilog blocks as a fallthrough
@ -1659,6 +1765,13 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
      rewriteUsesOf(MI);
    }
  }
+  for (auto *MI : IllegalPhisToDelete) {
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(*MI);
+    MI->eraseFromParent();
+  }
+  IllegalPhisToDelete.clear();
+
  // Now all remapping has been done, we're free to optimize the generated code.
  for (MachineBasicBlock *B : reverse(Blocks))
    EliminateDeadPhis(B, MRI, LIS);
@ -1727,9 +1840,10 @@ void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
      R = MI->getOperand(1).getReg();
    MRI.setRegClass(R, MRI.getRegClass(PhiR));
    MRI.replaceRegWith(PhiR, R);
-    if (LIS)
-      LIS->RemoveMachineInstrFromMaps(*MI);
-    MI->eraseFromParent();
+    // Postpone deleting the Phi as it may be referenced by BlockMIs and used
+    // later to figure out how to remap registers.
+    MI->getOperand(0).setReg(PhiR);
+    IllegalPhisToDelete.push_back(MI);
    return;
  }

--- a/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
+++ b/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
@ -1,6 +1,4 @@
 ; RUN: llc -march=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
-; XFAIL: *
-; LSR changes required.

 ; This version of the conv3x3 test has both loops. This test checks that the
 ; inner loop has 13 packets.
--- a/test/CodeGen/Hexagon/swp-epilog-phi12.ll
+++ b/test/CodeGen/Hexagon/swp-epilog-phi12.ll
@ -0,0 +1,54 @@
+; RUN: llc -march=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true < %s | FileCheck %s
+
+; Test epilogue generation when reading loop-carried dependency from a previous
+; stage. The first epilogue should read value from iteration N-1 of the kernel.
+
+; CHECK: loop0
+; CHECK: r{{[0-9]+}} = add([[REG0:r([0-9]+)]],#8)
+; CHECK: [[REG0:r([0-9]+)]] = [[REG1:r([0-9]+)]]
+; CHECK: endloop0
+; CHECK: = add([[REG1]],#8)
+
+; Function Attrs: nounwind
+define i32* @f0(i16* nocapture readonly %a0, i32 %a1) #0 {
+b0:
+  %v0 = alloca [129 x i32], align 8
+  br i1 undef, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i16* [ %a0, %b1 ], [ %v2, %b2 ]
+  %v2 = phi i16* [ undef, %b1 ], [ %v15, %b2 ]
+  %v3 = phi i32* [ null, %b1 ], [ %v4, %b2 ]
+  %v4 = phi i32* [ null, %b1 ], [ %v14, %b2 ]
+  %v5 = phi i32 [ 0, %b1 ], [ %v13, %b2 ]
+  %v6 = phi i16* [ undef, %b1 ], [ %v12, %b2 ]
+  %v7 = load i16, i16* %v2, align 2
+  %v8 = sext i16 %v7 to i32
+  %v9 = call i32 @llvm.hexagon.M2.mpy.ll.s0(i32 %v8, i32 %v8) #2
+  %v10 = load i16, i16* %v6, align 2
+  %v11 = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32 %v9, i32 undef, i32 undef) #2
+  store i32 %v11, i32* %v4, align 4
+  %v12 = getelementptr inbounds i16, i16* %v6, i32 -1
+  %v13 = add i32 %v5, 1
+  %v14 = getelementptr inbounds i32, i32* %v3, i32 2
+  %v15 = getelementptr inbounds i16, i16* %v1, i32 2
+  %v16 = icmp slt i32 %v13, %a1
+  br i1 %v16, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
+  %out = phi i32* [ null, %b0 ], [ %v14, %b2 ]
+  ret i32* %out
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.M2.mpy.ll.s0(i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32, i32, i32) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
--- a/test/CodeGen/Hexagon/swp-stages4.ll
+++ b/test/CodeGen/Hexagon/swp-stages4.ll
@ -5,7 +5,6 @@

 ; CHECK: = and
 ; CHECK: = and
-; CHECK: r[[REGA:[0-9]+]] = memub(r{{[0-9]+}}+#1)
 ; CHECK: r[[REG0:[0-9]+]] = and(r[[REG1:[0-9]+]],#255)
 ; CHECK-NOT: r[[REG0]] = and(r[[REG1]],#255)
 ; CHECK: loop0(.LBB0_[[LOOP:.]],