
AMDGPU: Add DS append/consume intrinsics

Since these pass the pointer in m0, unlike other DS instructions, they
need to worry about whether the address is uniform or not. This
assumes the address is dynamically uniform, and just uses
readfirstlane to get a copy into an SGPR.

I don't know whether these have the same 16-bit-add problem with the
addressing mode offset on SI or not, but I've just assumed they do.
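
As a minimal IR sketch of both points (the kernel name is hypothetical;
the intrinsic signature is the one this commit adds, as in the tests
below):

  declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1)

  ; With a uniform SGPR pointer this selects to "s_mov_b32 m0, <ptr>" plus
  ; "ds_append". If the pointer instead lands in a VGPR, selection inserts
  ; v_readfirstlane_b32 to copy one lane's value into an SGPR first.
  define amdgpu_kernel void @append_example(i32 addrspace(3)* %lds,
                                            i32 addrspace(1)* %out) {
    %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
    store i32 %val, i32 addrspace(1)* %out
    ret void
  }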

Also includes some misc. changes to avoid test differences between the
LDS and GDS versions.

llvm-svn: 352422
commit 2159d2987e (parent c9d92c5f65)
Author: Matt Arsenault
Date:   2019-01-28 20:14:49 +00:00

7 changed files with 355 additions and 17 deletions

include/llvm/IR/IntrinsicsAMDGPU.td

@@ -406,9 +406,20 @@ class AMDGPUDSOrderedIntrinsic : Intrinsic<
   [NoCapture<0>]
 >;
 
+class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
+  [llvm_i32_ty],
+  [llvm_anyptr_ty, // LDS or GDS ptr
+   llvm_i1_ty], // isVolatile
+  [IntrConvergent, IntrArgMemOnly, NoCapture<0>]
+>;
+
 def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 
+// The pointer argument is assumed to be dynamically uniform if a VGPR.
+def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
+def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
+
 def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
 def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
 def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
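
Since the pointer parameter is llvm_anyptr_ty, one definition covers both
address spaces; the concrete mangled declarations (the same ones the new
tests use) are:

  declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1)   ; LDS
  declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1)   ; GDS
  declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1)  ; LDS
  declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1)  ; GDS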

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

@@ -106,12 +106,13 @@ private:
   MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
-  SDNode *glueCopyToM0(SDNode *N) const;
+  SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                        unsigned OffsetBits) const;
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -209,6 +210,7 @@ private:
   void SelectBRCOND(SDNode *N);
   void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
+  void SelectINTRINSIC_W_CHAIN(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -339,29 +341,32 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
   }
 }
 
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
-  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
-      !Subtarget->ldsRequiresM0Init())
-    return N;
-
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
   const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
+    *static_cast<const SITargetLowering*>(getTargetLowering());
 
   // Write max value to m0 before each load operation
 
   SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
-                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+                                 Val);
 
   SDValue Glue = M0.getValue(1);
 
   SmallVector <SDValue, 8> Ops;
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    Ops.push_back(N->getOperand(i));
-  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    Ops.push_back(N->getOperand(i));
 
   Ops.push_back(Glue);
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
+  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
+      !Subtarget->ldsRequiresM0Init())
+    return N;
+  return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+}
+
 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                   EVT VT) const {
   SDNode *Lo = CurDAG->getMachineNode(
@@ -472,7 +477,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
                   Opc == ISD::ATOMIC_LOAD_FADD ||
                   Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
                   Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
 
   switch (Opc) {
   default:
@@ -570,7 +575,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::STORE:
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE: {
-    N = glueCopyToM0LDSInit(N);
+    N = glueCopyToM0LDSInit(N);
     break;
   }
@@ -648,6 +653,12 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
       SelectCode(N);
       return;
     }
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    SelectINTRINSIC_W_CHAIN(N);
+    return;
+  }
   }
@@ -828,7 +839,7 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                          unsigned OffsetBits) const {
   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
       (OffsetBits == 8 && !isUInt<8>(Offset)))
@@ -1760,6 +1771,52 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
   CurDAG->RemoveDeadNode(N);
 }
 
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  if ((IntrID != Intrinsic::amdgcn_ds_append &&
+       IntrID != Intrinsic::amdgcn_ds_consume) ||
+      N->getValueType(0) != MVT::i32) {
+    SelectCode(N);
+    return;
+  }
+
+  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
+  // be copied to an SGPR with readfirstlane.
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
+    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Ptr = N->getOperand(2);
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+  SDValue Offset;
+  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
+    SDValue PtrBase = Ptr.getOperand(0);
+    SDValue PtrOffset = Ptr.getOperand(1);
+
+    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
+    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+      N = glueCopyToM0(N, PtrBase);
+      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
+    }
+  }
+
+  if (!Offset) {
+    N = glueCopyToM0(N, Ptr);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+  }
+
+  SDValue Ops[] = {
+    Offset,
+    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+    Chain,
+    N->getOperand(N->getNumOperands() - 1) // New glue
+  };
+
+  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                             unsigned &Mods) const {
   Mods = 0;
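
To illustrate what the isDSOffsetLegal branch of SelectINTRINSIC_W_CHAIN
buys, a small IR sketch (hypothetical kernel name, mirroring the
max-offset tests below): a legal constant offset is split out of the
pointer, so only the base is written to m0 and the offset goes in the
instruction's 16-bit offset field.

  declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1)

  define amdgpu_kernel void @fold_offset(i32 addrspace(3)* %lds,
                                         i32 addrspace(1)* %out) {
    ; 16383 * 4 bytes = 65532 fits in 16 bits, so this selects to
    ; "ds_append ... offset:65532" with only %lds copied to m0.
    %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
    %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
    store i32 %val, i32 addrspace(1)* %out
    ret void
  }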

lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

@@ -109,7 +109,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
       // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
       // can't represent this with range metadata because it's only allowed for
       // integer types.
-      if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
           ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
         continue;
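
In other words, on SI a region (GDS) pointer kernel argument now takes
the same early-continue as an LDS one; a hypothetical example of an
argument this now covers:

  ; On SI the high bits of %gds are known zero, which can't be expressed
  ; as range metadata on a pointer load, so the argument is simply skipped.
  define amdgpu_kernel void @gds_arg(i32 addrspace(2)* %gds) {
    ret void
  }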

lib/Target/AMDGPU/SIISelLowering.cpp

@@ -926,7 +926,20 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     return true;
   }
+  case Intrinsic::amdgcn_ds_append:
+  case Intrinsic::amdgcn_ds_consume: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(1));
+    if (!Vol || !Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+    return true;
+  }
   default:
     return false;
   }
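
A sketch of how the i1 operand drives the flags (hypothetical kernel;
anything other than a constant false is treated conservatively as
volatile):

  declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1)

  define amdgpu_kernel void @volatile_flag(i32 addrspace(3)* %lds) {
    ; i1 false -> MOLoad | MOStore only.
    %v0 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
    ; i1 true -> MOVolatile is added as well.
    %v1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 true)
    ret void
  }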
@@ -1978,7 +1991,8 @@ SDValue SITargetLowering::LowerFormalArguments(
       auto *ParamTy =
         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
         // On SI local pointers are just offsets into LDS, so they are always
         // less than 16-bits. On CI and newer they could potentially be
         // real pointers, so we can't guarantee their size.

lib/Target/AMDGPU/SIInstrInfo.cpp

@@ -275,6 +275,11 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
     if (OffsetImm) {
       // Normal, single offset LDS instruction.
       BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+
+      // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
+      // report that here?
+      if (!BaseOp)
+        return false;
       Offset = OffsetImm->getImm();
       assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
                                 "operands of type register.");

test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll

@@ -0,0 +1,125 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
; GCN-LABEL: {{^}}ds_append_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_lds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; SI: s_add_i32 [[PTR]], [[PTR]], 16
; SI: s_mov_b32 m0, [[PTR]]
; SI: ds_append [[RESULT:v[0-9]+]]{{$}}
; CIPLUS: s_mov_b32 m0, [[PTR]]
; CIPLUS: ds_append [[RESULT:v[0-9]+]] offset:16{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
%lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; SI: s_bitset1_b32 [[PTR]], 16
; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_lds_vgpr_addr:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN: s_mov_b32 m0, [[READLANE]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define void @ds_append_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_gds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] gds{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_gds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
%val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_gds_over_max_offset:
define amdgpu_kernel void @ds_append_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
%val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_lds_m0_restore:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; NOTGFX9: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
define amdgpu_kernel void @ds_append_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val0 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val0, i32 addrspace(1)* %out
%val1 = load volatile i32, i32 addrspace(3)* %lds
ret void
}
declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1) #1
declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1) #1
attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }

test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll

@@ -0,0 +1,125 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
; GCN-LABEL: {{^}}ds_consume_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_lds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; SI: s_add_i32 [[PTR]], [[PTR]], 16
; SI: s_mov_b32 m0, [[PTR]]
; SI: ds_consume [[RESULT:v[0-9]+]]{{$}}
; CIPLUS: s_mov_b32 m0, [[PTR]]
; CIPLUS: ds_consume [[RESULT:v[0-9]+]] offset:16{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
%lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; SI: s_bitset1_b32 [[PTR]], 16
; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_lds_vgpr_addr:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN: s_mov_b32 m0, [[READLANE]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define void @ds_consume_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_gds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] gds{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_gds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
%val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_gds_over_max_offset:
define amdgpu_kernel void @ds_consume_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
%val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_lds_m0_restore:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; NOTGFX9: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
define amdgpu_kernel void @ds_consume_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val0 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val0, i32 addrspace(1)* %out
%val1 = load volatile i32, i32 addrspace(3)* %lds
ret void
}
declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1) #1
declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1) #1
attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }