[AMDGPU] Fix llvm.amdgcn.init.exec and frame materialization

Frame-base materialization may insert vector instructions before EXEC is
initialised. Fix this by moving the lowering of llvm.amdgcn.init.exec later
in the backend. Also remove the SI_INIT_EXEC_LO pseudo, as it is no longer
necessary.

Reviewed By: ruiling

Differential Revision: https://reviews.llvm.org/D94645

commit 8b5995e559 (parent f6cb3fe42b)
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -182,6 +182,8 @@ def int_amdgcn_init_exec : Intrinsic<[],
 // Set EXEC according to a thread count packed in an SGPR input:
 //    thread_count = (input >> bitoffset) & 0x7f;
 // This is always moved to the beginning of the basic block.
+// Note: only inreg arguments to the parent function are valid as
+// inputs to this intrinsic, computed values cannot be used.
 def int_amdgcn_init_exec_from_input : Intrinsic<[],
   [llvm_i32_ty,       // 32-bit SGPR input
    llvm_i32_ty],      // bit offset of the thread count
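The comment above fully determines the EXEC value. As a cross-check, here is
a minimal C++ model of llvm.amdgcn.init.exec.from.input (an illustrative
sketch only; execFromInput is a made-up name, not backend code):

#include <cstdint>

// EXEC mask implied by a packed thread count: one bit per active lane.
// The count must come from an inreg SGPR argument, per the note above.
uint64_t execFromInput(uint32_t Input, uint32_t BitOffset,
                       unsigned WavefrontSize /* 32 or 64 */) {
  uint32_t ThreadCount = (Input >> BitOffset) & 0x7f;
  // A bitfield-mask instruction cannot produce all ones, so the backend
  // special-cases a full wave (see the CMP + CMOV sequence further down).
  if (ThreadCount == WavefrontSize)
    return ~0ull >> (64 - WavefrontSize);
  return (1ull << (ThreadCount & 0x3f)) - 1; // S_BFM-style mask
}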
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4021,77 +4021,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     MI.eraseFromParent();
     return BB;
   }
-  case AMDGPU::SI_INIT_EXEC:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
-            AMDGPU::EXEC)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_LO:
-    // This should be before all vector instructions.
-    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
-            AMDGPU::EXEC_LO)
-        .addImm(MI.getOperand(0).getImm());
-    MI.eraseFromParent();
-    return BB;
-
-  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
-    // Extract the thread count from an SGPR input and set EXEC accordingly.
-    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
-    //
-    // S_BFE_U32 count, input, {shift, 7}
-    // S_BFM_B64 exec, count, 0
-    // S_CMP_EQ_U32 count, 64
-    // S_CMOV_B64 exec, -1
-    MachineInstr *FirstMI = &*BB->begin();
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    Register InputReg = MI.getOperand(0).getReg();
-    Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    bool Found = false;
-
-    // Move the COPY of the input reg to the beginning, so that we can use it.
-    for (auto I = BB->begin(); I != &MI; I++) {
-      if (I->getOpcode() != TargetOpcode::COPY ||
-          I->getOperand(0).getReg() != InputReg)
-        continue;
-
-      if (I == FirstMI) {
-        FirstMI = &*++BB->begin();
-      } else {
-        I->removeFromParent();
-        BB->insert(FirstMI, &*I);
-      }
-      Found = true;
-      break;
-    }
-    assert(Found);
-    (void)Found;
-
-    // This should be before all vector instructions.
-    unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
-    bool isWave32 = getSubtarget()->isWave32();
-    unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
-        .addReg(InputReg)
-        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
-            Exec)
-        .addReg(CountReg)
-        .addImm(0);
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
-        .addReg(CountReg, RegState::Kill)
-        .addImm(getSubtarget()->getWavefrontSize());
-    BuildMI(*BB, FirstMI, DebugLoc(),
-            TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
-            Exec)
-        .addImm(-1);
-    MI.eraseFromParent();
-    return BB;
-  }
-
   case AMDGPU::GET_GROUPSTATICSIZE: {
     assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
            getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -399,32 +399,13 @@ def SI_INIT_EXEC : SPseudoInstSI <
   (outs), (ins i64imm:$src),
   [(int_amdgcn_init_exec (i64 timm:$src))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
   let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave64;
 }
 
-// FIXME: Intrinsic should be mangled for wave size.
-def SI_INIT_EXEC_LO : SPseudoInstSI <
-  (outs), (ins i32imm:$src), []> {
-  let Defs = [EXEC_LO];
-  let usesCustomInserter = 1;
-  let isAsCheapAsAMove = 1;
-  let WaveSizePredicate = isWave32;
-}
-
-// FIXME: Wave32 version
 def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   (outs), (ins SSrc_b32:$input, i32imm:$shift),
   [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
   let Defs = [EXEC];
-  let usesCustomInserter = 1;
-}
-
-def : GCNPat <
-  (int_amdgcn_init_exec timm:$src),
-  (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
-  let WaveSizePredicate = isWave32;
 }
 
 // Return for returning shaders to a shader variant epilog.
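With the LO variant gone, SI_INIT_EXEC carries a single i64 immediate for
both wave sizes, and the deferred lowering picks S_MOV_B32/EXEC_LO or
S_MOV_B64/EXEC as appropriate. A one-line C++ model of the value actually
written (illustrative sketch; initialExecMask is a made-up name):

#include <cstdint>

// Wave32 writes only EXEC_LO, so the immediate's low 32 bits take effect;
// wave64 writes the full 64-bit EXEC pair.
uint64_t initialExecMask(uint64_t Imm, bool IsWave32) {
  return IsWave32 ? (Imm & 0xffffffffull) : Imm;
}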
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -93,6 +93,8 @@ private:
 
   MachineBasicBlock *emitEndCf(MachineInstr &MI);
 
+  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
+
   void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                         SmallVectorImpl<MachineOperand> &Src) const;
 
@@ -661,6 +663,90 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
   return SplitBB;
 }
 
+void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
+                                       MachineInstr &MI) {
+  MachineFunction &MF = *MBB->getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  bool IsWave32 = ST.isWave32();
+
+  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
+    // This should be before all vector instructions.
+    BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
+        .addImm(MI.getOperand(0).getImm());
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(MI);
+    MI.eraseFromParent();
+    return;
+  }
+
+  // Extract the thread count from an SGPR input and set EXEC accordingly.
+  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+  //
+  // S_BFE_U32 count, input, {shift, 7}
+  // S_BFM_B64 exec, count, 0
+  // S_CMP_EQ_U32 count, 64
+  // S_CMOV_B64 exec, -1
+  Register InputReg = MI.getOperand(0).getReg();
+  MachineInstr *FirstMI = &*MBB->begin();
+  if (InputReg.isVirtual()) {
+    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
+    assert(DefInstr && DefInstr->isCopy());
+    if (DefInstr->getParent() == MBB) {
+      if (DefInstr != FirstMI) {
+        // If the `InputReg` is defined in current block, we also need to
+        // move that instruction to the beginning of the block.
+        DefInstr->removeFromParent();
+        MBB->insert(FirstMI, DefInstr);
+        if (LIS)
+          LIS->handleMove(*DefInstr);
+      } else {
+        // If first instruction is definition then move pointer after it.
+        FirstMI = &*std::next(FirstMI->getIterator());
+      }
+    }
+  }
+
+  // Insert instruction sequence at block beginning (before vector operations).
+  const DebugLoc DL = MI.getDebugLoc();
+  const unsigned WavefrontSize = ST.getWavefrontSize();
+  const unsigned Mask = (WavefrontSize << 1) - 1;
+  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
+                   .addReg(InputReg)
+                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+  auto BfmMI =
+      BuildMI(*MBB, FirstMI, DL,
+              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
+          .addReg(CountReg)
+          .addImm(0);
+  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
+                   .addReg(CountReg, RegState::Kill)
+                   .addImm(WavefrontSize);
+  auto CmovMI =
+      BuildMI(*MBB, FirstMI, DL,
+              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+              Exec)
+          .addImm(-1);
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
+  }
+
+  LIS->RemoveMachineInstrFromMaps(MI);
+  MI.eraseFromParent();
+
+  LIS->InsertMachineInstrInMaps(*BfeMI);
+  LIS->InsertMachineInstrInMaps(*BfmMI);
+  LIS->InsertMachineInstrInMaps(*CmpMI);
+  LIS->InsertMachineInstrInMaps(*CmovMI);
+
+  LIS->removeInterval(InputReg);
+  LIS->createAndComputeVirtRegInterval(InputReg);
+  LIS->createAndComputeVirtRegInterval(CountReg);
+}
+
 bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
   auto GetFallThroughSucc = [=](MachineBasicBlock *B) -> MachineBasicBlock * {
     auto *S = B->getNextNode();
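One detail worth calling out in lowerInitExec is the hard-coded 0x70000:
assuming the usual SALU bitfield encoding (offset in the low bits of src1,
width in bits [22:16]), it is simply a field width of 7, matching the 7-bit
thread count. A small packing sketch (sBfeImm is a made-up helper; the
static_assert mirrors the s_bfe_u32 ..., 0x70008 expected by the tests below
for shift = 8):

#include <cstdint>

// Pack an S_BFE_U32 src1 immediate. The pass masks the shift with
// (WavefrontSize << 1) - 1 (127 for wave64) before packing.
constexpr uint32_t sBfeImm(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x7f) | (Width << 16);
}

static_assert(sBfeImm(8, 7) == 0x70008, "matches the tests' s_bfe_u32");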
llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -781,6 +867,14 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
       SplitMBB = process(MI);
       break;
 
+    // FIXME: find a better place for this
+    case AMDGPU::SI_INIT_EXEC:
+    case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
+      lowerInitExec(MBB, MI);
+      if (LIS)
+        LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
+      break;
+
     default:
       break;
     }
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll
@@ -84,6 +84,117 @@ main_body:
   unreachable
 }
 
+; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_mov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
+main_body:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+  call void @llvm.amdgcn.init.exec(i64 -1)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v = bitcast i32 %v5 to float
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
+; GCN-NOT: {{^}}v_
+; GCN: s_bfe_u32 s2, s2, 0x70008
+; GCN-NEXT: s_bfm_b64 exec, s2, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 64
+; GCN-NEXT: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v = bitcast i32 %v5 to float
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
+; GCN-NOT: {{^}}v_
+; GCN: %endif
+; GCN: s_bfe_u32 s3, s2, 0x70008
+; GCN-NEXT: s_bfm_b64 exec, s3, 0
+; GCN-NEXT: s_cmp_eq_u32 s3, 64
+; GCN-NEXT: s_cmov_b64 exec, -1
+; GCN: v_mov
+; GCN: v_add
+define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
+main_body:
+  ; ideally these alloca would be in %endif, but this causes problems on Windows GlobalISel
+  %array0 = alloca [1024 x i32], align 16, addrspace(5)
+  %array1 = alloca [20 x i32], align 16, addrspace(5)
+
+  %cc = icmp uge i32 %count, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  call void asm sideeffect "", ""()
+  br label %endif
+
+endif:
+  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
+
+  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr0, align 4
+
+  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
+  store i32 %a, i32 addrspace(5)* %ptr1, align 4
+
+  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
+  store i32 %b, i32 addrspace(5)* %ptr2, align 4
+
+  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
+  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4
+
+  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
+  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4
+
+  %v5 = add i32 %v3, %v4
+  %v6 = add i32 %v5, %count
+  %v = bitcast i32 %v6 to float
+  ret float %v
+}
+
 declare void @llvm.amdgcn.init.exec(i64) #1
 declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1