[AMDGPU] Simplify nested SI_END_CF
This is to replace the optimization from SIOptimizeExecMaskingPreRA. We have fewer opportunities in the control flow lowering because many VGPR copies are still in place and will only be removed later, but we know for sure that an instruction is an SI_END_CF and not just an arbitrary S_OR_B64 with EXEC.

The subsequent change needs to convert s_and_saveexec into s_and and address the new TODO lines in the tests; after that, the code block guarded by the -amdgpu-remove-redundant-endcf option in the pre-RA exec mask optimizer will be removed.

Differential Revision: https://reviews.llvm.org/D76033
This commit is contained in:
parent 4560a0277a
commit 8e802d9c40
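The redundancy being removed is easiest to see at the machine-code level. For a nested if whose inner endif falls straight through to the outer endif, both SI_END_CF pseudos currently lower to an exec restore, and the inner one adds nothing because the outer saved mask covers a superset of the lanes. A minimal sketch (register numbers are illustrative, not taken from the tests):

  ; before this change: two back-to-back restores at the end of a nested if
  s_or_b64 exec, exec, s[2:3]    ; inner SI_END_CF
  s_or_b64 exec, exec, s[0:1]    ; outer SI_END_CF

  ; after this change: the inner restore is dropped, since s[0:1] holds the
  ; exec mask from before the outer branch and already re-enables every lane
  ; that s[2:3] would
  s_or_b64 exec, exec, s[0:1]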
@@ -51,6 +51,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -81,6 +82,7 @@ private:
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  DenseSet<const MachineInstr*> LoweredEndCf;

  const TargetRegisterClass *BoolRC = nullptr;
  unsigned AndOpc;
@@ -103,6 +105,13 @@ private:

  void combineMasks(MachineInstr &MI);

  // Skip to the next instruction, ignoring debug instructions, and trivial
  // block boundaries (blocks that have one (typically fallthrough) successor,
  // and the successor has one predecessor).
  MachineBasicBlock::iterator
  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator It) const;

public:
  static char ID;

@@ -396,6 +405,36 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MI.eraseFromParent();
}

MachineBasicBlock::iterator
SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {

  SmallSet<const MachineBasicBlock *, 4> Visited;
  MachineBasicBlock *B = &MBB;
  do {
    if (!Visited.insert(B).second)
      return MBB.end();

    auto E = B->end();
    for ( ; It != E; ++It) {
      if (TII->mayReadEXEC(*MRI, *It))
        break;
    }

    if (It != E)
      return It;

    if (B->succ_size() != 1)
      return MBB.end();

    // If there is one trivial successor, advance to the next block.
    MachineBasicBlock *Succ = *B->succ_begin();

    It = Succ->begin();
    B = Succ;
  } while (true);
}

void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -403,6 +442,18 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
  const DebugLoc &DL = MI.getDebugLoc();

  // If the only instruction immediately following this END_CF is another
  // END_CF in the only successor we can avoid emitting exec mask restore here.
  auto Next = skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI.getIterator()));
  if (Next != MBB.end() && (Next->getOpcode() == AMDGPU::SI_END_CF ||
                            LoweredEndCf.count(&*Next))) {
    LLVM_DEBUG(dbgs() << "Skip redundant "; MI.dump());
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(MI);
    MI.eraseFromParent();
    return;
  }

  MachineBasicBlock::iterator InsPt =
      Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def))
                                      : MBB.begin();
@@ -410,6 +461,8 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
    .addReg(Exec)
    .add(MI.getOperand(0));

  LoweredEndCf.insert(NewMI);

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

@@ -556,5 +609,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
    }
  }

  LoweredEndCf.clear();

  return true;
}
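Read together, the new helper and the check in emitEndCf implement a forward scan: starting right after the SI_END_CF being lowered, step over anything that cannot read exec and over trivial single-successor block boundaries; if the first exec reader found is another end-cf (pending or already lowered), the current restore is dropped. A hypothetical final-assembly view of such a case (the instructions and registers below are illustrative only, not taken from this patch):

  ; SI_END_CF for the inner if would normally be lowered right here
  s_mov_b32 s6, 0x100            ; scalar move: does not read exec, the scan skips it
  ; a single fall-through successor boundary is skipped as well
  s_or_b64 exec, exec, s[0:1]    ; outer restore is the first exec reader the scan
                                 ; finds, so the inner restore is never emitted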
@@ -1,10 +1,12 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-remove-redundant-endcf < %s | FileCheck -enable-var-scope -check-prefixes=GCN,ALL %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-opt-exec-mask-pre-ra=0 < %s | FileCheck -enable-var-scope -check-prefixes=DISABLED,ALL %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; ALL-LABEL: {{^}}simple_nested_if:
; GCN-LABEL: {{^}}simple_nested_if:
; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9_]+]]
; GCN: s_and_b64 exec, exec, vcc

; TODO: this does not need to save exec, just perform the and.
; GCN: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc

; GCN-NEXT: s_cbranch_execz [[ENDIF]]
; GCN-NEXT: ; %bb.{{[0-9]+}}:
; GCN: store_dword
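The TODO above matches the follow-up described in the commit message: once the inner exec restore is gone, the mask saved by the inner s_and_saveexec has no remaining use, so a plain AND of exec should suffice. A rough sketch of what that later change would emit for this check (the saved-mask register is hypothetical):

  ; current code generation, as checked by the TODO'd line
  s_and_saveexec_b64 s[4:5], vcc    ; s[4:5] is never read again

  ; intended follow-up
  s_and_b64 exec, exec, vcc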
@@ -13,9 +15,6 @@
; GCN: ds_write_b32
; GCN: s_endpgm

; DISABLED: s_or_b64 exec, exec
; DISABLED: s_or_b64 exec, exec
define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -39,7 +38,7 @@ bb.outer.end: ; preds = %bb.outer.then, %bb.
  ret void
}

; ALL-LABEL: {{^}}uncollapsable_nested_if:
; GCN-LABEL: {{^}}uncollapsable_nested_if:
; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
@@ -82,7 +81,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb
  ret void
}

; ALL-LABEL: {{^}}nested_if_if_else:
; GCN-LABEL: {{^}}nested_if_if_else:
; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
; GCN-NEXT: s_cbranch_execz [[ENDIF_OUTER:BB[0-9_]+]]
; GCN: s_and_saveexec_b64 [[SAVEEXEC_INNER:s\[[0-9:]+\]]]
@@ -128,7 +127,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
  ret void
}

; ALL-LABEL: {{^}}nested_if_else_if:
; GCN-LABEL: {{^}}nested_if_else_if:
; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER:s\[[0-9:]+\]]]
; GCN-NEXT: s_xor_b64 [[SAVEEXEC_OUTER2:s\[[0-9:]+\]]], exec, [[SAVEEXEC_OUTER]]
; GCN-NEXT: s_cbranch_execz [[THEN_OUTER:BB[0-9_]+]]
@@ -151,9 +150,9 @@ bb.outer.end: ; preds = %bb, %bb.then, %b
; GCN-NEXT: ; %bb.{{[0-9]+}}:
; GCN: store_dword
; GCN-NEXT: [[FLOW1]]:
; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_THEN]]
; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
; GCN-NOT: s_or_b64 exec
; GCN-NOT: {{^.*:}}
; GCN: ds_write_b32
; GCN: s_endpgm
define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
@@ -191,7 +190,7 @@ bb.outer.end:
  ret void
}

; ALL-LABEL: {{^}}s_endpgm_unsafe_barrier:
; GCN-LABEL: {{^}}s_endpgm_unsafe_barrier:
; GCN: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9_]+]]
; GCN-NEXT: ; %bb.{{[0-9]+}}:
@@ -216,8 +215,7 @@ bb.end: ; preds = %bb.then, %bb
  ret void
}

; Make sure scc liveness is updated if s_or_b64 is removed
; ALL-LABEL: {{^}}scc_liveness:
; GCN-LABEL: {{^}}scc_liveness:

; GCN: %bb10
; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
@@ -229,7 +227,9 @@ bb.end: ; preds = %bb.then, %bb
; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]

; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
; GCN: s_and_b64 exec, exec, {{vcc|s\[[0-9:]+\]}}

; TODO: this does not need to save exec, just perform the and.
; GCN: s_and_saveexec_b64 s[{{[0-9:]+}}], {{vcc|s\[[0-9:]+\]}}

; GCN-NOT: s_or_b64 exec, exec

@@ -46,7 +46,6 @@ body: |
; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
; GCN: bb.3:
; GCN: successors: %bb.4(0x80000000)
; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
; GCN: DBG_VALUE
; GCN: bb.4:
; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
@@ -146,7 +145,6 @@ body: |
; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
; GCN: bb.3:
; GCN: successors: %bb.4(0x80000000)
; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
; GCN: bb.4:
; GCN: successors: %bb.5(0x80000000)
; GCN: bb.5:
@@ -246,7 +244,6 @@ body: |
; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
; GCN: bb.3:
; GCN: successors: %bb.4(0x80000000)
; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
; GCN: bb.4:
; GCN: successors: %bb.5(0x80000000)
; GCN: DBG_VALUE
@@ -347,7 +344,6 @@ body: |
; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
; GCN: bb.3:
; GCN: successors: %bb.4(0x80000000)
; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GCN: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
; GCN: KILL [[DEF]]
@@ -450,7 +446,6 @@ body: |
; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
; GCN: bb.3:
; GCN: successors: %bb.4(0x80000000)
; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GCN: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
; GCN: KILL [[DEF]]
@@ -749,7 +744,6 @@ body: |
; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %7, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
; GCN: bb.3:
; GCN: successors: %bb.5(0x80000000)
; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
; GCN: S_BRANCH %bb.5
; GCN: bb.4:
; GCN: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc
@@ -58,7 +58,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
; GFX9-NEXT: s_cbranch_execz BB1_4
; GFX9-NEXT: s_cbranch_execz BB1_3
; GFX9-NEXT: ; %bb.1: ; %bb19
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6
@@ -100,9 +100,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
; GFX9-NEXT: v_add_u32_e32 v3, v3, v6
; GFX9-NEXT: s_andn2_b64 exec, exec, s[12:13]
; GFX9-NEXT: s_cbranch_execnz BB1_2
; GFX9-NEXT: ; %bb.3: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[12:13]
; GFX9-NEXT: BB1_4: ; %Flow3
; GFX9-NEXT: BB1_3: ; %Flow3
; GFX9-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]