Revert "[AMDGPU] Insert PS early exit at end of control flow"

This reverts commit 2bfcacf0ad362956277a1c2c9ba00ddc453a42ce. There appears to be an issue to analysis preservation.
2025-01-31 20:51:52 +01:00 · 2020-07-03 13:03:33 +09:00 · 2020-07-03 13:03:33 +09:00 · 5eef7b47fb
commit 5eef7b47fb
parent 5e4157d196
5 changed files with 58 additions and 159 deletions
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@ -447,15 +447,6 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
        break;
      }

-      case AMDGPU::SI_KILL_CLEANUP:
-        if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
-            dominatesAllReachable(MBB)) {
-          KillInstrs.push_back(&MI);
-        } else {
-          MI.eraseFromParent();
-        }
-        break;
-
      default:
        break;
      }
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@ -379,9 +379,6 @@ multiclass PseudoInstKill <dag ins> {
 defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

-let Defs = [EXEC] in
-def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
-
 let Defs = [EXEC,VCC] in
 def SI_ILLEGAL_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src),
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@ -89,10 +89,8 @@ private:
  MachineRegisterInfo *MRI = nullptr;
  SetVector<MachineInstr*> LoweredEndCf;
  DenseSet<Register> LoweredIf;
-  SmallSet<MachineInstr *, 16> NeedsKillCleanup;

  const TargetRegisterClass *BoolRC = nullptr;
-  bool InsertKillCleanups;
  unsigned AndOpc;
  unsigned OrOpc;
  unsigned XorOpc;
@ -113,8 +111,6 @@ private:

  void combineMasks(MachineInstr &MI);

-  void process(MachineInstr &MI);
-
  // Skip to the next instruction, ignoring debug instructions, and trivial
  // block boundaries (blocks that have one (typically fallthrough) successor,
  // and the successor has one predecessor.
@ -164,28 +160,8 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {

 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;

-static bool hasKill(const MachineBasicBlock *Begin,
-                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
-  DenseSet<const MachineBasicBlock*> Visited;
-  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
-                                               Begin->succ_end());
-
-  while (!Worklist.empty()) {
-    MachineBasicBlock *MBB = Worklist.pop_back_val();
-
-    if (MBB == End || !Visited.insert(MBB).second)
-      continue;
-    for (auto &Term : MBB->terminators())
-      if (TII->isKillTerminator(Term.getOpcode()))
-        return true;
-
-    Worklist.append(MBB->succ_begin(), MBB->succ_end());
-  }
-
-  return false;
-}
-
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
+                       const SIInstrInfo *TII) {
  Register SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

@ -194,6 +170,26 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

+  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
+  // if there is any such terminator simplififcations are not safe.
+  auto SMBB = MI.getParent();
+  auto EMBB = U->getParent();
+  DenseSet<const MachineBasicBlock*> Visited;
+  SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
+                                              SMBB->succ_end());
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
+
+    if (MBB == EMBB || !Visited.insert(MBB).second)
+      continue;
+    for(auto &Term : MBB->terminators())
+      if (TII->isKillTerminator(Term.getOpcode()))
+        return false;
+
+    Worklist.append(MBB->succ_begin(), MBB->succ_end());
+  }
+
  return true;
 }

@ -211,35 +207,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
  // If there is only one use of save exec register and that use is SI_END_CF,
  // we can optimize SI_IF by returning the full saved exec mask instead of
  // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI);
-
-  if (InsertKillCleanups) {
-    // Check for SI_KILL_*_TERMINATOR on full path of control flow and
-    // flag the associated SI_END_CF for insertion of a kill cleanup.
-    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
-    while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
-      assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
-      assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
-      MachineOperand &NextExec = UseMI->getOperand(0);
-      Register NextExecReg = NextExec.getReg();
-      if (NextExec.isDead()) {
-        assert(!SimpleIf);
-        break;
-      }
-      UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
-    }
-    if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
-      if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
-        NeedsKillCleanup.insert(&*UseMI);
-        SimpleIf = false;
-      }
-    }
-  } else if (SimpleIf) {
-    // Check for SI_KILL_*_TERMINATOR on path from if to endif.
-    // if there is any such terminator simplifications are not safe.
-    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
-    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
-  }
+  bool SimpleIf = isSimpleIf(MI, MRI, TII);

  // Add an implicit def of exec to discourage scheduling VALU after this which
  // will interfere with trying to form s_and_saveexec_b64 later.
@ -459,8 +427,6 @@ SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(

    auto E = B->end();
    for ( ; It != E; ++It) {
-      if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
-        continue;
      if (TII->mayReadEXEC(*MRI, *It))
        break;
    }
@ -495,18 +461,8 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {

  LoweredEndCf.insert(NewMI);

-  // If this ends control flow which contains kills (as flagged in emitIf)
-  // then insert an SI_KILL_CLEANUP immediately following the exec mask
-  // manipulation.  This can be lowered to early termination if appropriate.
-  MachineInstr *CleanUpMI = nullptr;
-  if (NeedsKillCleanup.count(&MI))
-    CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
-
-  if (LIS) {
+  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
-    if (CleanUpMI)
-      LIS->InsertMachineInstrInMaps(*CleanUpMI);
-  }

  MI.eraseFromParent();

@ -597,56 +553,6 @@ void SILowerControlFlow::optimizeEndCf() {
  }
 }

-void SILowerControlFlow::process(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineBasicBlock::iterator I(MI);
-  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
-
-  switch (MI.getOpcode()) {
-  case AMDGPU::SI_IF:
-    emitIf(MI);
-    break;
-
-  case AMDGPU::SI_ELSE:
-    emitElse(MI);
-    break;
-
-  case AMDGPU::SI_IF_BREAK:
-    emitIfBreak(MI);
-    break;
-
-  case AMDGPU::SI_LOOP:
-    emitLoop(MI);
-    break;
-
-  case AMDGPU::SI_END_CF:
-    emitEndCf(MI);
-    break;
-
-  default:
-    assert(false && "Attempt to process unsupported instruction");
-    break;
-  }
-
-  MachineBasicBlock::iterator Next;
-  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
-    Next = std::next(I);
-    MachineInstr &MaskMI = *I;
-    switch (MaskMI.getOpcode()) {
-    case AMDGPU::S_AND_B64:
-    case AMDGPU::S_OR_B64:
-    case AMDGPU::S_AND_B32:
-    case AMDGPU::S_OR_B32:
-      // Cleanup bit manipulations on exec mask
-      combineMasks(MaskMI);
-      break;
-    default:
-      I = MBB.end();
-      break;
-    }
-  }
-}
-
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
@ -656,8 +562,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  MRI = &MF.getRegInfo();
  BoolRC = TRI->getBoolRC();
-  InsertKillCleanups =
-      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  if (ST.isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
@ -679,49 +583,62 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
    Exec = AMDGPU::EXEC;
  }

-  SmallVector<MachineInstr *, 32> Worklist;
-
  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;

-    MachineBasicBlock::iterator I, Next;
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    MachineBasicBlock::iterator I, Next, Last;
+
+    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
-        process(MI);
+        emitIf(MI);
        break;

      case AMDGPU::SI_ELSE:
-      case AMDGPU::SI_IF_BREAK:
-      case AMDGPU::SI_LOOP:
-      case AMDGPU::SI_END_CF:
-        // Only build worklist if SI_IF instructions must be processed first.
-        if (InsertKillCleanups)
-          Worklist.push_back(&MI);
-        else
-          process(MI);
+        emitElse(MI);
        break;

+      case AMDGPU::SI_IF_BREAK:
+        emitIfBreak(MI);
+        break;
+
+      case AMDGPU::SI_LOOP:
+        emitLoop(MI);
+        break;
+
+      case AMDGPU::SI_END_CF:
+        emitEndCf(MI);
+        break;
+
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_OR_B64:
+      case AMDGPU::S_AND_B32:
+      case AMDGPU::S_OR_B32:
+        // Cleanup bit manipulations on exec mask
+        combineMasks(MI);
+        Last = I;
+        continue;
+
      default:
-        break;
+        Last = I;
+        continue;
      }
+
+      // Replay newly inserted code to combine masks
+      Next = (Last == MBB.end()) ? MBB.begin() : Last;
    }
  }

-  for (MachineInstr *MI : Worklist)
-    process(*MI);
-
  optimizeEndCf();

  LoweredEndCf.clear();
  LoweredIf.clear();
-  NeedsKillCleanup.clear();

  return true;
 }
--- a/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@ -61,11 +61,9 @@ loop:
  br label %loop
 }

-; Check that the epilog is the final block
+; In case there's an epilog, we shouldn't have to do this.
 ; CHECK-LABEL: return_nonvoid
-; CHECK: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB{{[0-9]+}}_{{[0-9]+}}:
+; CHECK-NOT: exp null off, off, off, off done vm
 define amdgpu_ps float @return_nonvoid(float %0) #0 {
 main_body:
  %cmp = fcmp olt float %0, 1.000000e+01
--- a/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@ -470,11 +470,7 @@ bb9:                                              ; preds = %bb4
 }

 ; CHECK-LABEL: {{^}}cbranch_kill:
-; CHECK: ; %bb.{{[0-9]+}}: ; %export
-; CHECK-NEXT: s_or_b64
-; CHECK-NEXT: s_cbranch_execz [[EXIT:BB[0-9]+_[0-9]+]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: exp null off, off, off, off done vm
+; CHECK-NOT: exp null off, off, off, off done vm
 define amdgpu_ps void @cbranch_kill(i32 inreg %0, <2 x float> %1) {
 .entry:
  %val0 = extractelement <2 x float> %1, i32 0