mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
AMDGPU: Insert mem_viol check loop around GWS pre-GFX9
It is necessary to emit this loop around GWS operations in case the wave is preempted pre-GFX9. llvm-svn: 363979
This commit is contained in:
parent
03cc6d624e
commit
d44a022d4e
@ -715,6 +715,15 @@ public:
|
||||
return getGeneration() < GFX9;
|
||||
}
|
||||
|
||||
// True if the hardware rewinds and replays GWS operations if a wave is
|
||||
// preempted. This is reported for generations GFX9 and newer.
|
||||
//
|
||||
// If this is false, a GWS operation requires testing whether a nack set the
|
||||
// MEM_VIOL bit, and repeating the operation if so.
|
||||
bool hasGWSAutoReplay() const {
|
||||
return getGeneration() >= GFX9;
|
||||
}
|
||||
|
||||
// Returns the AddNoCarryInsts flag.
// NOTE(review): presumably a subtarget feature bit; its declaration is outside
// this view -- confirm.
bool hasAddNoCarry() const {
|
||||
return AddNoCarryInsts;
|
||||
}
|
||||
|
@ -467,7 +467,7 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>;
|
||||
defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
|
||||
defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
|
||||
|
||||
let isConvergent = 1 in {
|
||||
let isConvergent = 1, usesCustomInserter = 1 in {
|
||||
def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
|
||||
let mayLoad = 0;
|
||||
}
|
||||
|
@ -323,6 +323,8 @@ enum Offset : unsigned { // Offset, (5) [10:6]
|
||||
OFFSET_WIDTH_ = 5,
|
||||
OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
|
||||
|
||||
OFFSET_MEM_VIOL = 8,
|
||||
|
||||
OFFSET_SRC_SHARED_BASE = 16,
|
||||
OFFSET_SRC_PRIVATE_BASE = 0
|
||||
};
|
||||
|
@ -2922,6 +2922,109 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
|
||||
return SplitBB;
|
||||
}
|
||||
|
||||
// Split block \p MBB at \p MI so that a loop can be inserted. If \p InstInLoop
|
||||
// is true, \p MI will be the only instruction in the loop body block.
|
||||
// Otherwise, it will be the first instruction in the remainder block.
|
||||
//
// Two new blocks are created and placed directly after \p MBB in function
// layout: the loop block, then the remainder block. The loop block gets
// successor edges to itself and to the remainder; the successors \p MBB had
// are transferred to the remainder, and \p MBB gets the loop block as its
// successor. No branch instructions are emitted here; only blocks, CFG edges
// and existing instructions are rearranged.
//
|
||||
/// \returns { LoopBody, Remainder }
|
||||
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
|
||||
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
|
||||
MachineFunction *MF = MBB.getParent();
|
||||
MachineBasicBlock::iterator I(&MI);
|
||||
|
||||
// To insert the loop we need to split the block. Move everything after this
|
||||
// point to a new block, and insert a new empty block between the two.
|
||||
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
|
||||
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
|
||||
// Advance to the position right after MBB; both new blocks go there.
MachineFunction::iterator MBBI(MBB);
|
||||
++MBBI;
|
||||
|
||||
// Inserting both before the same iterator yields the order
// MBB, LoopBB, RemainderBB.
MF->insert(MBBI, LoopBB);
|
||||
MF->insert(MBBI, RemainderBB);
|
||||
|
||||
// The loop block may branch back to itself or exit to the remainder.
LoopBB->addSuccessor(LoopBB);
|
||||
LoopBB->addSuccessor(RemainderBB);
|
||||
|
||||
// Move the rest of the block into a new block. The remainder takes over
// MBB's successors (with PHI updates) before any instructions are moved.
|
||||
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
|
||||
|
||||
if (InstInLoop) {
|
||||
// Capture the instruction after MI before MI is spliced away.
auto Next = std::next(I);
|
||||
|
||||
// Move instruction to loop body.
|
||||
LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
|
||||
|
||||
// Move the rest of the block.
|
||||
RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
|
||||
} else {
|
||||
// MI and everything after it become the start of the remainder block.
RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
|
||||
}
|
||||
|
||||
// MBB now ends before the split point and continues into the loop block.
MBB.addSuccessor(LoopBB);
|
||||
|
||||
return std::make_pair(LoopBB, RemainderBB);
|
||||
}
|
||||
|
||||
// Wrap the GWS instruction \p MI in a retry loop. \p BB is split so that
// \p MI sits alone in a new loop block; the loop then clears
// TRAP_STS.MEM_VIOL, executes \p MI, waits, reads the bit back and branches to
// the top of the loop while the bit is non-zero.
//
// \returns the remainder block that holds the instructions which followed
// \p MI in \p BB.
MachineBasicBlock *
|
||||
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const {
|
||||
const DebugLoc &DL = MI.getDebugLoc();
|
||||
|
||||
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
|
||||
|
||||
MachineBasicBlock *LoopBB;
|
||||
MachineBasicBlock *RemainderBB;
|
||||
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
|
||||
|
||||
// Remember the instruction before MI so that a copy can still be placed at
// MI's old position in BB after MI has been moved into the loop block.
// NOTE(review): std::prev assumes MI is not the first instruction in BB --
// confirm this always holds for GWS instructions.
MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
|
||||
|
||||
// InstInLoop = true: MI becomes the only instruction in LoopBB.
std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
|
||||
|
||||
// Instructions built at I are appended to the end of LoopBB, i.e. after MI.
MachineBasicBlock::iterator I = LoopBB->end();
|
||||
|
||||
MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
|
||||
assert(Src && "missing operand from GWS instruction");
|
||||
|
||||
// Encoded hwreg operand shared by the s_setreg and s_getreg below: register
// TRAPSTS, offset OFFSET_MEM_VIOL, last argument 1 (presumably the field
// width in bits -- confirm against encodeHwreg).
const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
|
||||
AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
|
||||
|
||||
// Clear TRAP_STS.MEM_VIOL. Inserted at the start of the loop block, ahead of
// MI, so the bit is reset on every iteration.
|
||||
BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
|
||||
.addImm(0)
|
||||
.addImm(EncodedReg);
|
||||
|
||||
// This is a pain, but we're not allowed to have physical register live-ins
|
||||
// yet. Insert a pair of copies if the VGPR0 hack is necessary.
|
||||
if (TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
|
||||
unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
||||
// Copy the physical source into a virtual register in BB, at the position
// MI occupied before the split.
BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
|
||||
.add(*Src);
|
||||
|
||||
// Copy it back into the physical register at the very top of the loop
// block (this lands before the s_setreg inserted above).
BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
|
||||
.addReg(Data0);
|
||||
|
||||
// Hint the allocator to give Data0 the same physical register.
MRI.setSimpleHint(Data0, Src->getReg());
|
||||
}
|
||||
|
||||
// s_waitcnt with immediate 0, placed right after MI; presumably this makes
// sure the GWS operation has completed before TRAPSTS is read.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
|
||||
.addImm(0);
|
||||
|
||||
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
|
||||
// Load and check TRAP_STS.MEM_VIOL
|
||||
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
|
||||
.addImm(EncodedReg);
|
||||
|
||||
// FIXME: Do we need to use an isel pseudo that may clobber scc?
|
||||
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
|
||||
.addReg(Reg, RegState::Kill)
|
||||
.addImm(0);
|
||||
// Branch back to the top of the loop when the compare found a non-zero bit.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
|
||||
.addMBB(LoopBB);
|
||||
|
||||
return RemainderBB;
|
||||
}
|
||||
|
||||
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
|
||||
// wavefront. If the value is uniform and just happens to be in a VGPR, this
|
||||
// will only do one iteration. In the worst case, this will loop 64 times.
|
||||
@ -3061,24 +3164,9 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
|
||||
BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
|
||||
.addReg(Exec);
|
||||
|
||||
// To insert the loop we need to split the block. Move everything after this
|
||||
// point to a new block, and insert a new empty block between the two.
|
||||
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
|
||||
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
|
||||
MachineFunction::iterator MBBI(MBB);
|
||||
++MBBI;
|
||||
|
||||
MF->insert(MBBI, LoopBB);
|
||||
MF->insert(MBBI, RemainderBB);
|
||||
|
||||
LoopBB->addSuccessor(LoopBB);
|
||||
LoopBB->addSuccessor(RemainderBB);
|
||||
|
||||
// Move the rest of the block into a new block.
|
||||
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
|
||||
RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
|
||||
|
||||
MBB.addSuccessor(LoopBB);
|
||||
MachineBasicBlock *LoopBB;
|
||||
MachineBasicBlock *RemainderBB;
|
||||
std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
|
||||
|
||||
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
|
||||
|
||||
@ -3630,6 +3718,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
|
||||
MI.eraseFromParent();
|
||||
return BB;
|
||||
}
|
||||
case AMDGPU::DS_GWS_INIT:
|
||||
case AMDGPU::DS_GWS_SEMA_V:
|
||||
case AMDGPU::DS_GWS_SEMA_BR:
|
||||
case AMDGPU::DS_GWS_SEMA_P:
|
||||
case AMDGPU::DS_GWS_BARRIER:
|
||||
if (getSubtarget()->hasGWSAutoReplay())
|
||||
return BB;
|
||||
return emitGWSMemViolTestLoop(MI, BB);
|
||||
default:
|
||||
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
|
||||
}
|
||||
|
@ -313,6 +313,9 @@ public:
|
||||
MachineBasicBlock *splitKillBlock(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const;
|
||||
|
||||
// Emits a loop around the GWS instruction \p MI that repeats it while
// TRAP_STS.MEM_VIOL is set; returns the block holding the code after \p MI.
MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const;
|
||||
|
||||
MachineBasicBlock *
|
||||
EmitInstrWithCustomInserter(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const override;
|
||||
|
@ -1,14 +1,23 @@
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s
|
||||
|
||||
; Minimum offset
|
||||
; GCN-LABEL: {{^}}gws_barrier_offset0:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; NOLOOP-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; NOLOOP: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
|
||||
; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]:
|
||||
; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
|
||||
; LOOP-NEXT: ds_gws_barrier v0 offset:1 gds
|
||||
; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
|
||||
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
|
||||
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
|
||||
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
|
||||
ret void
|
||||
@ -16,10 +25,10 @@ define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
|
||||
|
||||
; Maximum offset
|
||||
; GCN-LABEL: {{^}}gws_barrier_offset63:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 offset:64 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; NOLOOP-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_barrier v0 offset:64 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
|
||||
ret void
|
||||
@ -27,11 +36,11 @@ define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
|
||||
|
||||
; FIXME: Should be able to shift directly into m0
|
||||
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_barrier v0 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
|
||||
ret void
|
||||
@ -39,11 +48,11 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
|
||||
|
||||
; Variable offset in SGPR with constant add
|
||||
; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
|
||||
%offset = add i32 %offset.base, 1
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
|
||||
@ -51,12 +60,12 @@ define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.ba
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_barrier v0 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
|
||||
%vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
|
||||
@ -65,12 +74,12 @@ define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
|
||||
|
||||
; Variable offset in VGPR with constant add
|
||||
; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_barrier v0 offset:3 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_barrier v0 offset:3 gds{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
|
||||
%vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%vgpr.offset = add i32 %vgpr.offset.base, 3
|
||||
@ -82,8 +91,8 @@ define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
|
||||
|
||||
; Check if m0 initialization is shared
|
||||
; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; GCN-NOT: s_mov_b32 m0
|
||||
; NOLOOP: s_mov_b32 m0, -1
|
||||
; NOLOOP-NOT: s_mov_b32 m0
|
||||
define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
|
||||
store i32 1, i32 addrspace(3)* @lds
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
|
||||
@ -93,9 +102,9 @@ define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val)
|
||||
|
||||
; Make sure this increments lgkmcnt
|
||||
; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
|
||||
; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}}
|
||||
; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_setpc_b64
|
||||
define void @gws_barrier_lgkmcnt(i32 %val) {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
|
||||
ret void
|
||||
@ -103,9 +112,8 @@ define void @gws_barrier_lgkmcnt(i32 %val) {
|
||||
|
||||
; Does not imply memory fence on its own
|
||||
; GCN-LABEL: {{^}}gws_barrier_wait_before:
|
||||
; GCN: store_dword
|
||||
; CIPLUS-NOT: s_waitcnt
|
||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP: s_waitcnt
|
||||
; NOLOOP-NOT: s_waitcnt{{$}}
|
||||
define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
@ -113,9 +121,9 @@ define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_barrier_wait_after:
|
||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
||||
; GCN-NEXT: s_waitcnt expcnt(0){{$}}
|
||||
; GCN-NEXT: load_dword
|
||||
; NOLOOP: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt expcnt(0){{$}}
|
||||
; NOLOOP-NEXT: load_dword
|
||||
define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
%load = load volatile i32, i32 addrspace(1)* %ptr
|
||||
@ -124,9 +132,9 @@ define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %p
|
||||
|
||||
; Does not imply memory fence on its own
|
||||
; GCN-LABEL: {{^}}gws_barrier_fence_before:
|
||||
; GCN: store_dword
|
||||
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP: store_dword
|
||||
; NOLOOP: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; NOLOOP: ds_gws_barrier v0 offset:8 gds
|
||||
define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
fence release
|
||||
@ -135,9 +143,10 @@ define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)*
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_barrier_fence_after:
|
||||
; GCN: ds_gws_barrier v0 offset:8 gds
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: load_dword
|
||||
; NOLOOP: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; NOLOOP-NEXT: load_dword
|
||||
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
fence release
|
||||
@ -147,9 +156,9 @@ define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %
|
||||
|
||||
; FIXME: Should a wait be inserted here, or is an explicit fence needed?
|
||||
; GCN-LABEL: {{^}}gws_init_barrier:
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; GCN: ds_gws_init v0 offset:8 gds
|
||||
; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP: s_mov_b32 m0, -1
|
||||
; NOLOOP: ds_gws_init v0 offset:8 gds
|
||||
; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds
|
||||
define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
||||
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
|
||||
@ -158,10 +167,11 @@ define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
|
||||
|
||||
; FIXME: Why vmcnt, not expcnt?
|
||||
; GCN-LABEL: {{^}}gws_init_fence_barrier:
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; GCN: ds_gws_init v0 offset:8 gds
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
|
||||
; NOLOOP: s_mov_b32 m0, -1
|
||||
; NOLOOP: ds_gws_init v0 offset:8 gds
|
||||
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds
|
||||
define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
||||
fence release
|
||||
|
@ -1,14 +1,23 @@
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
|
||||
|
||||
; Minimum offset
|
||||
; GCN-LABEL: {{^}}gws_init_offset0:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 offset:1 gds{{$}}
|
||||
; NOLOOP: ds_gws_init v0 offset:1 gds{{$}}
|
||||
|
||||
; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]:
|
||||
; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
|
||||
; LOOP-NEXT: ds_gws_init v0 offset:1 gds
|
||||
; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
|
||||
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
|
||||
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
|
||||
define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
|
||||
ret void
|
||||
@ -16,10 +25,19 @@ define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
|
||||
|
||||
; Maximum offset
|
||||
; GCN-LABEL: {{^}}gws_init_offset63:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 offset:64 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; NOLOOP-DAG: s_mov_b32 m0, -1{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_init v0 offset:64 gds{{$}}
|
||||
|
||||
|
||||
; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]:
|
||||
; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0
|
||||
; LOOP-NEXT: ds_gws_init v0 offset:64 gds
|
||||
; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
|
||||
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
|
||||
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
|
||||
define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
|
||||
ret void
|
||||
@ -27,11 +45,11 @@ define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
|
||||
|
||||
; FIXME: Should be able to shift directly into m0
|
||||
; GCN-LABEL: {{^}}gws_init_sgpr_offset:
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_init v0 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
|
||||
ret void
|
||||
@ -39,11 +57,11 @@ define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
|
||||
|
||||
; Variable offset in SGPR with constant add
|
||||
; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 offset:1 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
|
||||
; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
|
||||
; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_init v0 offset:1 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
|
||||
%offset = add i32 %offset.base, 1
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
|
||||
@ -51,12 +69,12 @@ define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base)
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_init_vgpr_offset:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_init v0 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
|
||||
%vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
|
||||
@ -65,12 +83,12 @@ define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
|
||||
|
||||
; Variable offset in VGPR with constant add
|
||||
; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
|
||||
; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; GCN: ds_gws_init v0 offset:3 gds{{$}}
|
||||
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
|
||||
; NOLOOP-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
|
||||
; NOLOOP-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
|
||||
; NOLOOP-DAG: s_mov_b32 m0, [[SHL]]{{$}}
|
||||
; NOLOOP-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
|
||||
; NOLOOP: ds_gws_init v0 offset:3 gds{{$}}
|
||||
define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
|
||||
%vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%vgpr.offset = add i32 %vgpr.offset.base, 3
|
||||
@ -82,8 +100,8 @@ define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
|
||||
|
||||
; Check if m0 initialization is shared.
|
||||
; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
|
||||
; GCN: s_mov_b32 m0, -1
|
||||
; GCN-NOT: s_mov_b32 m0
|
||||
; NOLOOP: s_mov_b32 m0, -1
|
||||
; NOLOOP-NOT: s_mov_b32 m0
|
||||
define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
|
||||
store i32 1, i32 addrspace(3)* @lds
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
|
||||
@ -92,9 +110,9 @@ define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}gws_init_lgkmcnt:
|
||||
; GCN: ds_gws_init v0 offset:1 gds{{$}}
|
||||
; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
; NOLOOP: ds_gws_init v0 offset:1 gds{{$}}
|
||||
; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
|
||||
; NOLOOP-NEXT: s_setpc_b64
|
||||
define void @gws_init_lgkmcnt(i32 %val) {
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
|
||||
ret void
|
||||
@ -102,9 +120,8 @@ define void @gws_init_lgkmcnt(i32 %val) {
|
||||
|
||||
; Does not imply memory fence on its own
|
||||
; GCN-LABEL: {{^}}gws_init_wait_before:
|
||||
; GCN: store_dword
|
||||
; CIPLUS-NOT: s_waitcnt
|
||||
; GCN: ds_gws_init v0 offset:8 gds
|
||||
; NOLOOP: s_waitcnt
|
||||
; NOLOOP-NOT: s_waitcnt
|
||||
define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
|
||||
store i32 0, i32 addrspace(1)* %ptr
|
||||
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
|
||||
|
Loading…
Reference in New Issue
Block a user