1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-02-01 05:01:59 +01:00

[AMDGPU] Add llvm.amdgcn.wqm.demote intrinsic

Add intrinsic which demotes all active lanes to helper lanes.
This is used to implement demote to helper Vulkan extension.

In practice demoting a lane to helper simply means removing it
from the mask of live lanes used for WQM/WWM/Exact mode.
Where the shader does not use WQM, demotes just become kills.

Additionally add llvm.amdgcn.live.mask intrinsic to complement
demote operations. In theory llvm.amdgcn.ps.live can be used
to detect helper lanes; however, ps.live can be moved by LICM.
The movement of ps.live cannot be remedied without changing
its type signature and such a change would require ps.live
users to update as well.

Reviewed By: piotr

Differential Revision: https://reviews.llvm.org/D94747
This commit is contained in:
Carl Ritson 2021-02-14 09:52:41 +09:00
parent 615b38354f
commit 730fd62aad
10 changed files with 2559 additions and 23 deletions

View File

@ -1349,13 +1349,18 @@ def int_amdgcn_interp_p2_f16 :
[IntrNoMem, IntrSpeculatable, IntrWillReturn,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
// Pixel shaders only: whether the current pixel is live (i.e. not a helper
// invocation for derivative computation).
// Deprecated: use llvm.amdgcn.live.mask instead.
// NOTE: because this is IntrNoMem, passes such as LICM may freely move the
// call; that motion cannot be prevented without changing this signature, which
// is why llvm.amdgcn.live.mask was introduced as the replacement.
def int_amdgcn_ps_live : Intrinsic <
[llvm_i1_ty],
[],
[IntrNoMem, IntrWillReturn]>;
// Query currently live lanes.
// Returns true if lane is live (and not a helper lane).
// Modelled as a read of inaccessible memory (rather than IntrNoMem, as
// ps.live is) so that optimizers treat the query as ordered and do not
// hoist or sink it the way they can with ps.live.
def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty],
[], [IntrReadMem, IntrInaccessibleMemOnly]
>;
def int_amdgcn_mbcnt_lo :
GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
@ -1585,6 +1590,11 @@ def int_amdgcn_endpgm : GCCBuiltin<"__builtin_amdgcn_endpgm">,
Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects]
>;
// If false, mark all active lanes as helper lanes until the end of program.
// Modelled as a write to inaccessible memory so the demote is ordered with
// respect to other memory operations and is not removed as dead code.
def int_amdgcn_wqm_demote : Intrinsic<[],
[llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly]
>;
// Copies the active channels of the source value to the destination value,
// with the guarantee that the source value is computed as if the entire
// program were executed in Whole Wavefront Mode, i.e. with all channels

View File

@ -4194,6 +4194,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
break;
}
case Intrinsic::amdgcn_live_mask: {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
}
case Intrinsic::amdgcn_wqm_demote:
case Intrinsic::amdgcn_kill: {
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;

View File

@ -243,6 +243,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
// live.mask, like ps.live above, produces a per-lane i1 and so is divergent.
def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;

View File

@ -378,6 +378,18 @@ def SI_PS_LIVE : PseudoInstSI <
let SALU = 1;
}
let Uses = [EXEC] in {
// Selects llvm.amdgcn.live.mask; reads EXEC to report per-lane liveness.
def SI_LIVE_MASK : PseudoInstSI <
(outs SReg_1:$dst), (ins),
[(set i1:$dst, (int_amdgcn_live_mask))]> {
let SALU = 1;
}
let Defs = [EXEC,SCC] in {
// Demote: Turn a pixel shader thread into a helper lane.
// $killvalue gives the polarity: lanes where $src equals $killvalue are
// demoted (see the int_amdgcn_wqm_demote selection patterns).
def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
} // End Defs = [EXEC,SCC]
} // End Uses = [EXEC]
def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
[(int_amdgcn_unreachable)],
"; divergent unreachable"> {
@ -751,6 +763,16 @@ def : Pat <
(SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
// Demote lanes where the condition is false (killvalue = 0).
def : Pat <
(int_amdgcn_wqm_demote i1:$src),
(SI_DEMOTE_I1 SCSrc_i1:$src, 0)
>;
// Fold an inverted condition into the killvalue operand instead of emitting
// a separate NOT: demote lanes where $src is true (killvalue = -1).
def : Pat <
(int_amdgcn_wqm_demote (i1 (not i1:$src))),
(SI_DEMOTE_I1 SCSrc_i1:$src, -1)
>;
// TODO: we could add more variants for other types of conditionals
def : Pat <

View File

@ -72,6 +72,7 @@ private:
MachineRegisterInfo *MRI = nullptr;
SetVector<MachineInstr*> LoweredEndCf;
DenseSet<Register> LoweredIf;
SmallSet<MachineBasicBlock *, 4> KillBlocks;
const TargetRegisterClass *BoolRC = nullptr;
unsigned AndOpc;
@ -84,6 +85,8 @@ private:
unsigned OrSaveExecOpc;
unsigned Exec;
bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
void emitIfBreak(MachineInstr &MI);
@ -161,8 +164,8 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
static bool hasKill(const MachineBasicBlock *Begin,
const MachineBasicBlock *End, const SIInstrInfo *TII) {
bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
const MachineBasicBlock *End) {
DenseSet<const MachineBasicBlock*> Visited;
SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());
@ -171,9 +174,8 @@ static bool hasKill(const MachineBasicBlock *Begin,
if (MBB == End || !Visited.insert(MBB).second)
continue;
for (auto &Term : MBB->terminators())
if (TII->isKillTerminator(Term.getOpcode()))
return true;
if (KillBlocks.contains(MBB))
return true;
Worklist.append(MBB->succ_begin(), MBB->succ_end());
}
@ -213,7 +215,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// Check for SI_KILL_*_TERMINATOR on path from if to endif.
// if there is any such terminator simplifications are not safe.
auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
}
// Add an implicit def of exec to discourage scheduling VALU after this which
@ -799,6 +801,28 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
Exec = AMDGPU::EXEC;
}
// Compute set of blocks with kills
const bool CanDemote =
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
for (auto &MBB : MF) {
bool IsKillBlock = false;
for (auto &Term : MBB.terminators()) {
if (TII->isKillTerminator(Term.getOpcode())) {
KillBlocks.insert(&MBB);
IsKillBlock = true;
break;
}
}
if (CanDemote && !IsKillBlock) {
for (auto &MI : MBB) {
if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
KillBlocks.insert(&MBB);
break;
}
}
}
}
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin();
BI != MF.end(); BI = NextBB) {
@ -848,6 +872,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
LoweredEndCf.clear();
LoweredIf.clear();
KillBlocks.clear();
return true;
}

View File

@ -457,10 +457,11 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
III.Disabled = StateWQM | StateWWM;
continue;
} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {
if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
LiveMaskQueries.push_back(&MI);
} else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR) {
Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
Opcode == AMDGPU::SI_DEMOTE_I1) {
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
} else if (WQMOutputs) {
@ -799,6 +800,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *MaskUpdateMI = nullptr;
const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
const MachineOperand &Op = MI.getOperand(0);
int64_t KillVal = MI.getOperand(1).getImm();
MachineInstr *ComputeKilledMaskMI = nullptr;
@ -815,10 +817,14 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
} else {
// Static: kill does nothing
MachineInstr *NewTerm = nullptr;
assert(MBB.succ_size() == 1);
NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(*MBB.succ_begin());
LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
if (IsDemote) {
LIS->RemoveMachineInstrFromMaps(MI);
} else {
assert(MBB.succ_size() == 1);
NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(*MBB.succ_begin());
LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
}
MBB.remove(&MI);
return NewTerm;
}
@ -848,17 +854,30 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
// In the case we got this far some lanes are still live,
// update EXEC to deactivate lanes as appropriate.
MachineInstr *NewTerm;
if (Op.isImm()) {
unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
} else if (!IsWQM) {
NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
MachineInstr *WQMMaskMI = nullptr;
Register LiveMaskWQM;
if (IsDemote) {
// Demotes deactivate quads with only helper lanes
LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
WQMMaskMI =
BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
.addReg(Exec)
.addReg(LiveMaskReg);
.addReg(LiveMaskWQM);
} else {
unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
NewTerm =
BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
// Kills deactivate lanes
if (Op.isImm()) {
unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
} else if (!IsWQM) {
NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
.addReg(Exec)
.addReg(LiveMaskReg);
} else {
unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
NewTerm =
BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
}
}
// Update live intervals
@ -871,6 +890,8 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
LIS->InsertMachineInstrInMaps(*EarlyTermMI);
if (WQMMaskMI)
LIS->InsertMachineInstrInMaps(*WQMMaskMI);
LIS->InsertMachineInstrInMaps(*NewTerm);
if (CndReg) {
@ -879,6 +900,8 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
}
if (TmpReg)
LIS->createAndComputeVirtRegInterval(TmpReg);
if (LiveMaskWQM)
LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
return NewTerm;
}
@ -910,6 +933,7 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
MachineInstr *SplitPoint = nullptr;
switch (MI.getOpcode()) {
case AMDGPU::SI_DEMOTE_I1:
case AMDGPU::SI_KILL_I1_TERMINATOR:
SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
break;
@ -1319,6 +1343,7 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
MachineBasicBlock *MBB = MI->getParent();
MachineInstr *SplitPoint = nullptr;
switch (MI->getOpcode()) {
case AMDGPU::SI_DEMOTE_I1:
case AMDGPU::SI_KILL_I1_TERMINATOR:
SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
break;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,16 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
---
# The llvm.amdgcn.live.mask result is a per-lane i1; regbankselect should
# assign it to the VCC bank (checked below for both fast and greedy modes).
name: live_mask
legalized: true
body: |
bb.0:
; CHECK-LABEL: name: live_mask
; CHECK: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask)
; CHECK: S_ENDPGM 0, implicit [[INT]](s1)
%0:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask)
S_ENDPGM 0, implicit %0
...

View File

@ -0,0 +1,69 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s| FileCheck %s
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s| FileCheck %s
---
# SGPR-produced condition: the s1 operand must be copied into the VCC bank
# before feeding llvm.amdgcn.wqm.demote.
name: wqm_demote_scc
legalized: true
body: |
bb.0:
liveins: $sgpr0, $sgpr1
; CHECK-LABEL: name: wqm_demote_scc
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
; CHECK: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY2]](s1)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2
...
---
# VGPR-produced condition: the compare already yields vcc(s1), so no extra
# copy is needed for the demote operand.
name: wqm_demote_vcc
legalized: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
; CHECK-LABEL: name: wqm_demote_vcc
; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[ICMP]](s1)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s1) = G_ICMP intpred(eq), %0, %1
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2
...
---
# Constant true condition: sgpr constant is copied into the VCC bank.
name: wqm_demote_constant_true
legalized: true
body: |
bb.0:
; CHECK-LABEL: name: wqm_demote_constant_true
; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true
; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1)
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
%0:_(s1) = G_CONSTANT i1 true
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0
...
---
# Constant false condition: same bank assignment as the true case.
name: wqm_demote_constant_false
legalized: true
body: |
bb.0:
; CHECK-LABEL: name: wqm_demote_constant_false
; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false
; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1)
; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
%0:_(s1) = G_CONSTANT i1 false
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0
...

File diff suppressed because it is too large Load Diff