From dc754bb13dd885c83063706c81f5bdf9ba279a5e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Mon, 12 Nov 2018 18:48:17 +0000
Subject: [PATCH] [AMDGPU] Optimize S_CBRANCH_VCC[N]Z -> S_CBRANCH_EXEC[N]Z

Sometimes after basic block placement we end up with code like:

  sreg = s_mov_b64 -1
  vcc = s_and_b64 exec, sreg
  s_cbranch_vccz

This happens when a block that assigns -1 to a saved mask is joined
with another block that consumes the saved mask with s_and_b64 and a
branch. When moved into a single new basic block, this sequence is
essentially a single s_cbranch_execz instruction.

Differential Revision: https://reviews.llvm.org/D54164

llvm-svn: 346690
---
 lib/Target/AMDGPU/SIInsertSkips.cpp          |  97 ++++++
 test/CodeGen/AMDGPU/branch-relaxation.ll     |   2 +-
 test/CodeGen/AMDGPU/infinite-loop.ll         |   2 +-
 test/CodeGen/AMDGPU/insert-skip-from-vcc.mir | 320 +++++++++++++++++++
 4 files changed, 419 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/insert-skip-from-vcc.mir

diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index dc9397cf7b8..f23fa02bf8a 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -66,6 +66,8 @@ private:
   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
 
+  bool optimizeVccBranch(MachineInstr &MI) const;
+
 public:
   static char ID;
 
@@ -320,6 +322,96 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
   return true;
 }
 
+bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
+  // Match:
+  // sreg = -1
+  // vcc = S_AND_B64 exec, sreg
+  // S_CBRANCH_VCC[N]Z
+  // =>
+  // S_CBRANCH_EXEC[N]Z
+  bool Changed = false;
+  MachineBasicBlock &MBB = *MI.getParent();
+  const unsigned CondReg = AMDGPU::VCC;
+  const unsigned ExecReg = AMDGPU::EXEC;
+  const unsigned And = AMDGPU::S_AND_B64;
+
+  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+                                      E = MBB.rend();
+  bool ReadsCond = false;
+  unsigned Threshold = 5;
+  for (++A ; A != E ; ++A) {
+    if (!--Threshold)
+      return false;
+    if (A->modifiesRegister(ExecReg, TRI))
+      return false;
+    if (A->modifiesRegister(CondReg, TRI)) {
+      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
+        return false;
+      break;
+    }
+    ReadsCond |= A->readsRegister(CondReg, TRI);
+  }
+  if (A == E)
+    return false;
+
+  MachineOperand &Op1 = A->getOperand(1);
+  MachineOperand &Op2 = A->getOperand(2);
+  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+    TII->commuteInstruction(*A);
+    Changed = true;
+  }
+  if (Op1.getReg() != ExecReg)
+    return Changed;
+  if (Op2.isImm() && Op2.getImm() != -1)
+    return Changed;
+
+  unsigned SReg = AMDGPU::NoRegister;
+  if (Op2.isReg()) {
+    SReg = Op2.getReg();
+    auto M = std::next(A);
+    bool ReadsSreg = false;
+    for ( ; M != E ; ++M) {
+      if (M->definesRegister(SReg, TRI))
+        break;
+      if (M->modifiesRegister(SReg, TRI))
+        return Changed;
+      ReadsSreg |= M->readsRegister(SReg, TRI);
+    }
+    if (M == E ||
+        !M->isMoveImmediate() ||
+        !M->getOperand(1).isImm() ||
+        M->getOperand(1).getImm() != -1)
+      return Changed;
+    // First, if sreg is only used in the AND instruction, fold the
+    // immediate into that AND.
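+    // This is only safe when nothing between the S_MOV and the S_AND
+    // reads sreg and the S_AND kills sreg; otherwise the S_MOV must
+    // stay for the remaining uses.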
+    if (!ReadsSreg && Op2.isKill()) {
+      A->getOperand(2).ChangeToImmediate(-1);
+      M->eraseFromParent();
+    }
+  }
+
+  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+      MI.killsRegister(CondReg, TRI))
+    A->eraseFromParent();
+
+  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+  if (SReg == ExecReg) {
+    if (IsVCCZ) {
+      MI.eraseFromParent();
+      return true;
+    }
+    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+  } else {
+    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
+                               : AMDGPU::S_CBRANCH_EXECNZ));
+  }
+
+  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+  MI.addImplicitDefUseOperands(*MBB.getParent());
+
+  return true;
+}
+
 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -417,6 +509,11 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         }
         break;
 
+      case AMDGPU::S_CBRANCH_VCCZ:
+      case AMDGPU::S_CBRANCH_VCCNZ:
+        MadeChange |= optimizeVccBranch(MI);
+        break;
+
       default:
         break;
       }
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index 72c983d5d97..03c3c5031b3 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -453,7 +453,7 @@ endif:
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64
 ; GCN: ;;#ASMEND
-; GCN: s_cbranch_vccz [[RET]]
+; GCN: s_cbranch_execz [[RET]]
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
 ; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll
index 5005b781ca3..e265f5ca4ea 100644
--- a/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -36,7 +36,7 @@ loop:
 ; SI: s_and_b64 vcc, exec, -1
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: buffer_store_dword [[REG]]
-; SI: s_cbranch_vccnz [[LOOP]]
+; SI: s_cbranch_execnz [[LOOP]]
 
 ; SI: [[RET]]: ; %UnifiedReturnBlock
 ; SI: s_endpgm
diff --git a/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir b/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
new file mode 100644
index 00000000000..9427b5cd254
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert-skip-from-vcc.mir
@@ -0,0 +1,320 @@
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+# GCN-LABEL: name: and_execz_mov_vccz
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_AND_
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_mov_vccz
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_imm_vccz
+# GCN-NOT: S_AND_
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_imm_vccz
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execnz_imm_vccnz
+# GCN-NOT: S_AND_
+# GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+name: and_execnz_imm_vccnz
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_imm_vccz_live_scc
+# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_imm_vccz_live_scc
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_live_scc
+# GCN-NOT: S_MOV_
+# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_mov_vccz_live_scc
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_live_sreg
+# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_mov_vccz_live_sreg
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_live_sreg_commute
+# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_mov_vccz_live_sreg_commute
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $sgpr0_sgpr1, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_live_scc_commute
+# GCN-NOT: S_MOV_
+# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_mov_vccz_live_scc_commute
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 killed $sgpr0_sgpr1, $exec, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_commute
+# GCN-NOT: S_MOV_
+# GCN-NOT: S_AND_
+# GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_mov_vccz_commute
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 killed $sgpr0_sgpr1, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_exec_vccz
+# GCN: $exec = S_MOV_B64 -1
+# GCN-NEXT: S_ENDPGM
+name: and_execz_mov_exec_vccz
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $exec = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_exec_vccnz
+# GCN: $exec = S_MOV_B64 -1
+# GCN-NEXT: S_BRANCH %bb.1{{$}}
+name: and_execz_mov_exec_vccnz
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $exec = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, $exec, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_reads_sreg_early
+# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr1
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_mov_vccz_reads_sreg_early
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $sgpr2 = S_MOV_B32 $sgpr1
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_reads_sreg_late
+# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr1
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_mov_vccz_reads_sreg_late
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, $sgpr0_sgpr1, implicit-def dead $scc
+    $sgpr2 = S_MOV_B32 $sgpr1
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_reads_writes_sreg_early
+# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: $sgpr1 = S_MOV_B32 $sgpr0
+# GCN-NEXT: $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+name: and_execz_mov_vccz_reads_writes_sreg_early
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $sgpr1 = S_MOV_B32 $sgpr0
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_reads_cond
+# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
+# GCN-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+name: and_execz_mov_vccz_reads_cond
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    $sgpr2 = S_MOV_B32 $vcc_lo
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_mov_vccz_modifies_sreg
+# GCN: $sgpr0_sgpr1 = S_MOV_B64 -1
+# GCN-NEXT: $sgpr0 = S_MOV_B32 0
+# GCN-NEXT: $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+# GCN-NEXT: S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+name: and_execz_mov_vccz_modifies_sreg
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $sgpr0_sgpr1 = S_MOV_B64 -1
+    $sgpr0 = S_MOV_B32 0
+    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM
+...
+---
+# GCN-LABEL: name: and_execz_imm_vccz_liveout_scc
+# GCN: $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+# GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec
+# GCN-NEXT: S_ENDPGM implicit $scc
+name: and_execz_imm_vccz_liveout_scc
+body: |
+  bb.0:
+    S_NOP 0
+
+  bb.1:
+    S_NOP 0
+
+  bb.2:
+    $vcc = S_AND_B64 $exec, -1, implicit-def $scc
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_ENDPGM implicit $scc
+...
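
Worked example (a sketch distilled from the first test above,
and_execz_mov_vccz): with this patch, si-insert-skips rewrites a block
of the form

    $sgpr0_sgpr1 = S_MOV_B64 -1
    $vcc = S_AND_B64 $exec, killed $sgpr0_sgpr1, implicit-def dead $scc
    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc

into a single branch on the exec mask:

    S_CBRANCH_EXECZ %bb.1, implicit $exec

The S_MOV_B64 and S_AND_B64 are deleted here only because their results
are dead afterwards (the sreg use is killed and $scc is dead); the
live_scc, live_sreg and reads_cond tests above show that when those
values have later uses, the defining instructions are kept and only the
branch is rewritten.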