
[AMDGPU] Simplify nested SI_END_CF

This replaces the optimization from SIOptimizeExecMaskingPreRA. We have
fewer opportunities during control flow lowering because many VGPR
copies are still in place and will only be removed later, but we know
for sure that an instruction is an SI_END_CF and not just an arbitrary
S_OR_B64 with EXEC.
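
For example (an illustrative sketch with made-up save registers, not
output copied from the tests): when the inner and outer SI_END_CF of
two nested ifs lower back to back, the inner exec restore is redundant,
because the outer restore reactivates a superset of those lanes:

  ; before
  s_or_b64 exec, exec, s[0:1]   ; inner SI_END_CF, redundant
  s_or_b64 exec, exec, s[2:3]   ; outer SI_END_CF

  ; after
  s_or_b64 exec, exec, s[2:3]   ; only the outer restore is emitted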

A subsequent change needs to convert s_and_saveexec into s_and and
address the new TODO lines in the tests; after that, the code block
guarded by the -amdgpu-remove-redundant-endcf option in the pre-RA exec
mask optimizer will be removed.
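
As a sketch of that follow-up (the TODO lines in the tests below mark
the spots), once the inner restore is gone the mask saved by the inner
s_and_saveexec_b64 has no remaining use, so

  s_and_saveexec_b64 s[0:1], vcc   ; saved mask s[0:1] is now dead

can become a plain

  s_and_b64 exec, exec, vcc

with s[0:1] again just an illustrative register pair.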

Differential Revision: https://reviews.llvm.org/D76033
Stanislav Mekhanoshin 2020-03-11 13:17:32 -07:00
parent 4560a0277a
commit 8e802d9c40
4 changed files with 74 additions and 27 deletions

lib/Target/AMDGPU/SILowerControlFlow.cpp

@@ -51,6 +51,7 @@
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -81,6 +82,7 @@ private:
   const SIInstrInfo *TII = nullptr;
   LiveIntervals *LIS = nullptr;
   MachineRegisterInfo *MRI = nullptr;
+  DenseSet<const MachineInstr*> LoweredEndCf;
   const TargetRegisterClass *BoolRC = nullptr;
   unsigned AndOpc;
@@ -103,6 +105,13 @@ private:
   void combineMasks(MachineInstr &MI);
 
+  // Skip to the next instruction, ignoring debug instructions, and trivial
+  // block boundaries (blocks that have one (typically fallthrough) successor,
+  // and the successor has one predecessor).
+  MachineBasicBlock::iterator
+  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator It) const;
+
 public:
   static char ID;
@@ -396,6 +405,36 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
+MachineBasicBlock::iterator
+SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+
+  SmallSet<const MachineBasicBlock *, 4> Visited;
+  MachineBasicBlock *B = &MBB;
+  do {
+    if (!Visited.insert(B).second)
+      return MBB.end();
+
+    auto E = B->end();
+    for ( ; It != E; ++It) {
+      if (TII->mayReadEXEC(*MRI, *It))
+        break;
+    }
+
+    if (It != E)
+      return It;
+
+    if (B->succ_size() != 1)
+      return MBB.end();
+
+    // If there is one trivial successor, advance to the next block.
+    MachineBasicBlock *Succ = *B->succ_begin();
+
+    It = Succ->begin();
+    B = Succ;
+  } while (true);
+}
+
 void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -403,6 +442,18 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
   const DebugLoc &DL = MI.getDebugLoc();
 
+  // If the only instruction immediately following this END_CF is another
+  // END_CF in the only successor we can avoid emitting exec mask restore here.
+  auto Next = skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI.getIterator()));
+  if (Next != MBB.end() && (Next->getOpcode() == AMDGPU::SI_END_CF ||
+                            LoweredEndCf.count(&*Next))) {
+    LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump());
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(MI);
+    MI.eraseFromParent();
+    return;
+  }
+
   MachineBasicBlock::iterator InsPt =
       Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def))
                                       : MBB.begin();
@@ -410,6 +461,8 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
           .addReg(Exec)
           .add(MI.getOperand(0));
 
+  LoweredEndCf.insert(NewMI);
+
   if (LIS)
     LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
@@ -556,5 +609,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
+  LoweredEndCf.clear();
+
   return true;
 }

test/CodeGen/AMDGPU/collapse-endcf.ll

@@ -1,10 +1,12 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefixes=GCN,ALL %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-opt-exec-mask-pre-ra=0 < %s | FileCheck -enable-var-scope -check-prefixes=DISABLED,ALL %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
-; ALL-LABEL: {{^}}simple_nested_if:
+; GCN-LABEL: {{^}}simple_nested_if:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9_]+]]
-; GCN: s_and_b64 exec, exec, vcc
+; TODO: this does not need to save exec, just perform the and.
+; GCN: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc
 ; GCN-NEXT: s_cbranch_execz [[ENDIF]]
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
@@ -13,9 +15,6 @@
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
 
-; DISABLED: s_or_b64 exec, exec
-; DISABLED: s_or_b64 exec, exec
-
 define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -39,7 +38,7 @@ bb.outer.end: ; preds = %bb.outer.then, %bb.
   ret void
 }
 
-; ALL-LABEL: {{^}}uncollapsable_nested_if:
+; GCN-LABEL: {{^}}uncollapsable_nested_if:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
@@ -82,7 +81,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb
   ret void
 }
 
-; ALL-LABEL: {{^}}nested_if_if_else:
+; GCN-LABEL: {{^}}nested_if_if_else:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
@@ -128,7 +127,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
   ret void
 }
 
-; ALL-LABEL: {{^}}nested_if_else_if:
+; GCN-LABEL: {{^}}nested_if_else_if:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_xor_b64 [[SAVEEXEC_OUTER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_OUTER]]
 ; GCN-NEXT: s_cbranch_execz [[THEN_OUTER:BB[0-9_]+]]
@@ -151,9 +150,9 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
 ; GCN: store_dword
 ; GCN-NEXT: [[FLOW1]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_THEN]]
-; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
+; GCN-NOT: s_or_b64 exec
+; GCN-NOT: {{^.*:}}
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
 define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
@@ -191,7 +190,7 @@ bb.outer.end:
   ret void
 }
 
-; ALL-LABEL: {{^}}s_endpgm_unsafe_barrier:
+; GCN-LABEL: {{^}}s_endpgm_unsafe_barrier:
 ; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
 ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9_]+]]
 ; GCN-NEXT: ; %bb.{{[0-9]+}}:
@@ -216,8 +215,7 @@ bb.end: ; preds = %bb.then, %bb
   ret void
 }
 
 ; Make sure scc liveness is updated if s_or_b64 is removed
-; ALL-LABEL: {{^}}scc_liveness:
+; GCN-LABEL: {{^}}scc_liveness:
 ; GCN: %bb10
 ; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
@@ -229,7 +227,9 @@ bb.end: ; preds = %bb.then, %bb
 ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
 
 ; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
 
-; GCN: s_and_b64 exec, exec, {{vcc|s\[[0-9:]+\]}}
+; TODO: this does not need to save exec, just perform the and.
+; GCN: s_and_saveexec_b64 s[{{[0-9:]+}}], {{vcc|s\[[0-9:]+\]}}
+; GCN-NOT: s_or_b64 exec, exec

test/CodeGen/AMDGPU/collapse-endcf.mir

@@ -46,7 +46,6 @@ body: |
 ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 ; GCN: bb.3:
 ; GCN: successors: %bb.4(0x80000000)
-; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
 ; GCN: DBG_VALUE
 ; GCN: bb.4:
 ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
@@ -146,7 +145,6 @@ body: |
 ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 ; GCN: bb.3:
 ; GCN: successors: %bb.4(0x80000000)
-; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
 ; GCN: bb.4:
 ; GCN: successors: %bb.5(0x80000000)
 ; GCN: bb.5:
@@ -246,7 +244,6 @@ body: |
 ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 ; GCN: bb.3:
 ; GCN: successors: %bb.4(0x80000000)
-; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
 ; GCN: bb.4:
 ; GCN: successors: %bb.5(0x80000000)
 ; GCN: DBG_VALUE
@@ -347,7 +344,6 @@ body: |
 ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 ; GCN: bb.3:
 ; GCN: successors: %bb.4(0x80000000)
-; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
 ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
 ; GCN: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
 ; GCN: KILL [[DEF]]
@@ -450,7 +446,6 @@ body: |
 ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 ; GCN: bb.3:
 ; GCN: successors: %bb.4(0x80000000)
-; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
 ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
 ; GCN: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
 ; GCN: KILL [[DEF]]
@@ -749,7 +744,6 @@ body: |
 ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 ; GCN: bb.3:
 ; GCN: successors: %bb.5(0x80000000)
-; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
 ; GCN: S_BRANCH %bb.5
 ; GCN: bb.4:
 ; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc

test/CodeGen/AMDGPU/mul24-pass-ordering.ll

@@ -58,7 +58,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
-; GFX9-NEXT: s_cbranch_execz BB1_4
+; GFX9-NEXT: s_cbranch_execz BB1_3
 ; GFX9-NEXT: ; %bb.1: ; %bb19
 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6
 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6
@@ -100,9 +100,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v6
 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[12:13]
 ; GFX9-NEXT: s_cbranch_execnz BB1_2
-; GFX9-NEXT: ; %bb.3: ; %Flow
-; GFX9-NEXT: s_or_b64 exec, exec, s[12:13]
-; GFX9-NEXT: BB1_4: ; %Flow3
+; GFX9-NEXT: BB1_3: ; %Flow3
 ; GFX9-NEXT: s_or_b64 exec, exec, s[10:11]
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]