
AMDGPU: Add DS append/consume intrinsics

Since these pass the pointer in m0, unlike other DS instructions, they
need to worry about whether the address is uniform or not. This
assumes the address is dynamically uniform, and just uses
readfirstlane to get a copy into an SGPR.

I don't know whether these have the same 16-bit-add problem with the
addressing mode offset on SI or not, but I've just assumed they do.
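
As a minimal IR sketch of both points (the kernel name is hypothetical;
the intrinsic signature is the one this commit adds, as in the tests
below):

  declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1)

  ; With a uniform SGPR pointer this selects to "s_mov_b32 m0, <ptr>" plus
  ; "ds_append". If the pointer instead lands in a VGPR, selection inserts
  ; v_readfirstlane_b32 to copy one lane's value into an SGPR first.
  define amdgpu_kernel void @append_example(i32 addrspace(3)* %lds,
                                            i32 addrspace(1)* %out) {
    %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
    store i32 %val, i32 addrspace(1)* %out
    ret void
  }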

Also includes some misc. changes to avoid test differences between the
LDS and GDS versions.

llvm-svn: 352422
commit 2159d2987e (parent c9d92c5f65)
Author: Matt Arsenault
Date:   2019-01-28 20:14:49 +00:00

7 changed files with 355 additions and 17 deletions

include/llvm/IR/IntrinsicsAMDGPU.td

@@ -406,9 +406,20 @@ class AMDGPUDSOrderedIntrinsic : Intrinsic<
   [NoCapture<0>]
 >;
 
+class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
+  [llvm_i32_ty],
+  [llvm_anyptr_ty, // LDS or GDS ptr
+   llvm_i1_ty], // isVolatile
+  [IntrConvergent, IntrArgMemOnly, NoCapture<0>]
+>;
+
 def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 
+// The pointer argument is assumed to be dynamically uniform if a VGPR.
+def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
+def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
+
 def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
 def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
 def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
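
Since the pointer parameter is llvm_anyptr_ty, one definition covers both
address spaces; the concrete mangled declarations (the same ones the new
tests use) are:

  declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1)   ; LDS
  declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1)   ; GDS
  declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1)  ; LDS
  declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1)  ; GDS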

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

@@ -106,12 +106,13 @@ private:
   MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
-  SDNode *glueCopyToM0(SDNode *N) const;
+  SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                        unsigned OffsetBits) const;
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -209,6 +210,7 @@ private:
   void SelectBRCOND(SDNode *N);
   void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
+  void SelectINTRINSIC_W_CHAIN(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -339,29 +341,32 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
   }
 }
 
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
-  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
-      !Subtarget->ldsRequiresM0Init())
-    return N;
-
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
   const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
+    *static_cast<const SITargetLowering*>(getTargetLowering());
 
   // Write max value to m0 before each load operation
 
   SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
-                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+                                 Val);
 
   SDValue Glue = M0.getValue(1);
 
   SmallVector <SDValue, 8> Ops;
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    Ops.push_back(N->getOperand(i));
-  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    Ops.push_back(N->getOperand(i));
 
   Ops.push_back(Glue);
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
+  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
+      !Subtarget->ldsRequiresM0Init())
+    return N;
+  return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+}
+
 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                   EVT VT) const {
   SDNode *Lo = CurDAG->getMachineNode(
@@ -472,7 +477,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
                   Opc == ISD::ATOMIC_LOAD_FADD ||
                   Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
                   Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
 
   switch (Opc) {
   default:
@@ -570,7 +575,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::STORE:
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE: {
-    N = glueCopyToM0LDSInit(N);
+    N = glueCopyToM0LDSInit(N);
     break;
   }
@@ -648,6 +653,12 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
       SelectCode(N);
       return;
     }
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    SelectINTRINSIC_W_CHAIN(N);
+    return;
+  }
   }
@@ -828,7 +839,7 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                          unsigned OffsetBits) const {
   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
       (OffsetBits == 8 && !isUInt<8>(Offset)))
@@ -1760,6 +1771,52 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
   CurDAG->RemoveDeadNode(N);
 }
 
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  if ((IntrID != Intrinsic::amdgcn_ds_append &&
+       IntrID != Intrinsic::amdgcn_ds_consume) ||
+      N->getValueType(0) != MVT::i32) {
+    SelectCode(N);
+    return;
+  }
+
+  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
+  // be copied to an SGPR with readfirstlane.
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
+    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Ptr = N->getOperand(2);
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+  SDValue Offset;
+  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
+    SDValue PtrBase = Ptr.getOperand(0);
+    SDValue PtrOffset = Ptr.getOperand(1);
+
+    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
+    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+      N = glueCopyToM0(N, PtrBase);
+      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
+    }
+  }
+
+  if (!Offset) {
+    N = glueCopyToM0(N, Ptr);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+  }
+
+  SDValue Ops[] = {
+    Offset,
+    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+    Chain,
+    N->getOperand(N->getNumOperands() - 1) // New glue
+  };
+
+  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                             unsigned &Mods) const {
   Mods = 0;
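
To illustrate what the isDSOffsetLegal branch of SelectINTRINSIC_W_CHAIN
buys, a small IR sketch (hypothetical kernel name, mirroring the
max-offset tests below): a legal constant offset is split out of the
pointer, so only the base is written to m0 and the offset goes in the
instruction's 16-bit offset field.

  declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1)

  define amdgpu_kernel void @fold_offset(i32 addrspace(3)* %lds,
                                         i32 addrspace(1)* %out) {
    ; 16383 * 4 bytes = 65532 fits in 16 bits, so this selects to
    ; "ds_append ... offset:65532" with only %lds copied to m0.
    %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
    %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
    store i32 %val, i32 addrspace(1)* %out
    ret void
  }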

lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

@@ -109,7 +109,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
       // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
       // can't represent this with range metadata because it's only allowed for
       // integer types.
-      if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
           ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
         continue;
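
In other words, on SI a region (GDS) pointer kernel argument now takes
the same early-continue as an LDS one; a hypothetical example of an
argument this now covers:

  ; On SI the high bits of %gds are known zero, which can't be expressed
  ; as range metadata on a pointer load, so the argument is simply skipped.
  define amdgpu_kernel void @gds_arg(i32 addrspace(2)* %gds) {
    ret void
  }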

lib/Target/AMDGPU/SIISelLowering.cpp

@@ -926,7 +926,20 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     return true;
   }
+  case Intrinsic::amdgcn_ds_append:
+  case Intrinsic::amdgcn_ds_consume: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(1));
+    if (!Vol || !Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+    return true;
+  }
   default:
     return false;
   }
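
A sketch of how the i1 operand drives the flags (hypothetical kernel;
anything other than a constant false is treated conservatively as
volatile):

  declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1)

  define amdgpu_kernel void @volatile_flag(i32 addrspace(3)* %lds) {
    ; i1 false -> MOLoad | MOStore only.
    %v0 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
    ; i1 true -> MOVolatile is added as well.
    %v1 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 true)
    ret void
  }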
@@ -1978,7 +1991,8 @@ SDValue SITargetLowering::LowerFormalArguments(
       auto *ParamTy =
         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
         // On SI local pointers are just offsets into LDS, so they are always
         // less than 16-bits. On CI and newer they could potentially be
         // real pointers, so we can't guarantee their size.

lib/Target/AMDGPU/SIInstrInfo.cpp

@@ -275,6 +275,11 @@ bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
     if (OffsetImm) {
       // Normal, single offset LDS instruction.
       BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+
+      // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
+      // report that here?
+      if (!BaseOp)
+        return false;
       Offset = OffsetImm->getImm();
       assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
                                 "operands of type register.");

test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll

@@ -0,0 +1,125 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
; GCN-LABEL: {{^}}ds_append_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_lds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; SI: s_add_i32 [[PTR]], [[PTR]], 16
; SI: s_mov_b32 m0, [[PTR]]
; SI: ds_append [[RESULT:v[0-9]+]]{{$}}
; CIPLUS: s_mov_b32 m0, [[PTR]]
; CIPLUS: ds_append [[RESULT:v[0-9]+]] offset:16{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
%lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; SI: s_bitset1_b32 [[PTR]], 16
; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_lds_vgpr_addr:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN: s_mov_b32 m0, [[READLANE]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define void @ds_append_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_gds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] gds{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_gds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
%val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_gds_over_max_offset:
define amdgpu_kernel void @ds_append_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
%val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_append_lds_m0_restore:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; NOTGFX9: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
define amdgpu_kernel void @ds_append_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val0 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val0, i32 addrspace(1)* %out
%val1 = load volatile i32, i32 addrspace(3)* %lds
ret void
}
declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1) #1
declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1) #1
attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }

test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll

@@ -0,0 +1,125 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
; GCN-LABEL: {{^}}ds_consume_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_lds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; SI: s_add_i32 [[PTR]], [[PTR]], 16
; SI: s_mov_b32 m0, [[PTR]]
; SI: ds_consume [[RESULT:v[0-9]+]]{{$}}
; CIPLUS: s_mov_b32 m0, [[PTR]]
; CIPLUS: ds_consume [[RESULT:v[0-9]+]] offset:16{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
%lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; SI: s_bitset1_b32 [[PTR]], 16
; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_lds_vgpr_addr:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN: s_mov_b32 m0, [[READLANE]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define void @ds_consume_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_gds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] gds{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gds, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_gds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
%val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_gds_over_max_offset:
define amdgpu_kernel void @ds_consume_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
%val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}ds_consume_lds_m0_restore:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; NOTGFX9: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
define amdgpu_kernel void @ds_consume_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
%val0 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
store i32 %val0, i32 addrspace(1)* %out
%val1 = load volatile i32, i32 addrspace(3)* %lds
ret void
}
declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1) #1
declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1) #1
attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }