
AMDGPU: Fix multi-use shl/add combine

This was using a custom function that didn't handle the
addressing modes properly for the private address space. Use
isLegalAddressingMode to avoid duplicating that logic.

Additionally, skip the combine if there is only one use
since the standard combine will handle it.

llvm-svn: 318013
Matt Arsenault 2017-11-13 05:11:54 +00:00
parent 1924bedd3d
commit 1744d88a1d
3 changed files with 188 additions and 100 deletions
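
Background for the diff below: the combine rewrites (shl (add x, c1), c2) into (add (shl x, c2), (shl c1, c2)) so that the shifted constant can become a memory instruction's immediate offset, e.g. (add tid, 2) << 2 folds to (tid << 2) + 8. The following is a minimal sketch of the legality gate the patch switches to, using the standard TargetLowering::AddrMode / isLegalAddressingMode API; the helper name canFoldShlAddOffset is illustrative only and not part of the commit, which calls isLegalAddressingMode directly from SITargetLowering (see the diff).

// Sketch only: shows the shape of the check, not the exact code in the patch.
#include "llvm/CodeGen/TargetLowering.h" // older trees: llvm/Target/TargetLowering.h
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// For ptr = (x + C1) << C2 the folded immediate is Offset = C1 << C2.
// Instead of a hand-rolled, per-address-space range check (the removed
// canFoldOffset below), ask the target whether "base register + Offset"
// is a legal addressing mode for this memory type and address space.
static bool canFoldShlAddOffset(const TargetLowering &TLI, const DataLayout &DL,
                                int64_t Offset, Type *MemTy, unsigned AddrSpace) {
  TargetLowering::AddrMode AM; // BaseGV + BaseOffs + HasBaseReg + Scale
  AM.HasBaseReg = true;        // the shifted base pointer stays in a register
  AM.BaseOffs = Offset;        // the shifted constant becomes the immediate
  return TLI.isLegalAddressingMode(DL, AM, MemTy, AddrSpace);
}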


@@ -5176,32 +5176,6 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
   return SDValue();
 }
 
-/// \brief Return true if the given offset Size in bytes can be folded into
-/// the immediate offsets of a memory instruction for the given address space.
-static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
-                          const SISubtarget &STI) {
-  auto AMDGPUASI = STI.getAMDGPUAS();
-  if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
-    // MUBUF instructions a 12-bit offset in bytes.
-    return isUInt<12>(OffsetSize);
-  }
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
-    // SMRD instructions have an 8-bit offset in dwords on SI and
-    // a 20-bit offset in bytes on VI.
-    if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
-      return isUInt<20>(OffsetSize);
-    else
-      return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
-  }
-  if (AS == AMDGPUASI.LOCAL_ADDRESS ||
-      AS == AMDGPUASI.REGION_ADDRESS) {
-    // The single offset versions have a 16-bit offset in bytes.
-    return isUInt<16>(OffsetSize);
-  }
-  // Indirect register addressing does not use any offsets.
-  return false;
-}
-
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
 // This is a variant of
@@ -5218,11 +5192,15 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
 //
 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
                                                unsigned AddrSpace,
+                                               EVT MemVT,
                                                DAGCombinerInfo &DCI) const {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
-  if (N0.getOpcode() != ISD::ADD)
+  // We only do this to handle cases where it's profitable when there are
+  // multiple uses of the add, so defer to the standard combine.
+  // TODO: Support or
+  if (N0.getOpcode() != ISD::ADD || N0->hasOneUse())
     return SDValue();
 
   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
@@ -5236,7 +5214,12 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
   // If the resulting offset is too large, we can't fold it into the addressing
   // mode offset.
   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
-  if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
+  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
+
+  AddrMode AM;
+  AM.HasBaseReg = true;
+  AM.BaseOffs = Offset.getSExtValue();
+  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
@@ -5256,9 +5239,9 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
   SDLoc SL(N);
 
   // TODO: We could also do this for multiplies.
-  unsigned AS = N->getAddressSpace();
-  if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
-    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
+  if (Ptr.getOpcode() == ISD::SHL) {
+    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
+                                          N->getMemoryVT(), DCI);
     if (NewPtr) {
       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());


@@ -88,6 +88,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                                DAGCombinerInfo &DCI) const;
   SDValue performSHLPtrCombine(SDNode *N,
                                unsigned AS,
+                               EVT MemVT,
                                DAGCombinerInfo &DCI) const;
   SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;


@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; Test that doing a shift of a pointer with a constant add will be
 ; folded into the constant offset addressing mode even if the add has
@@ -15,10 +15,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8
 
-; SI-LABEL: {{^}}load_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}load_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -32,13 +32,13 @@ define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 add
 
 ; Make sure once the first use is folded into the addressing mode, the
 ; remaining add use goes through the normal shl + add constant fold.
-; SI-LABEL: {{^}}load_shl_base_lds_1:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
-; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
-; SI-DAG: buffer_store_dword [[RESULT]]
-; SI-DAG: buffer_store_dword [[ADDUSE]]
-; SI: s_endpgm
+; GCN-LABEL: {{^}}load_shl_base_lds_1:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
+; GCN: v_add_i32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
+; GCN-DAG: buffer_store_dword [[RESULT]]
+; GCN-DAG: buffer_store_dword [[ADDUSE]]
+; GCN: s_endpgm
 define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -52,9 +52,9 @@ define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 add
 
 @maxlds = addrspace(3) global [65536 x i8] undef, align 4
 
-; SI-LABEL: {{^}}load_shl_base_lds_max_offset
-; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
-; SI: s_endpgm
+; GCN-LABEL: {{^}}load_shl_base_lds_max_offset
+; GCN: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
+; GCN: s_endpgm
 define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 65535
@@ -68,11 +68,11 @@ define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i
 
 ; The two globals are placed adjacent in memory, so the same base
 ; pointer can be used with an offset into the second one.
-; SI-LABEL: {{^}}load_shl_base_lds_2:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: s_mov_b32 m0, -1
-; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
-; SI: s_endpgm
+; GCN-LABEL: {{^}}load_shl_base_lds_2:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: s_mov_b32 m0, -1
+; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+; GCN: s_endpgm
 define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
@@ -85,10 +85,10 @@ define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   ret void
 }
 
-; SI-LABEL: {{^}}store_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}store_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -115,10 +115,10 @@ define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 ad
 ; }
 
-; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -130,10 +130,10 @@ define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out,
   ret void
 }
 
-; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_swap_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -144,10 +144,10 @@ define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i3
   ret void
 }
 
-; SI-LABEL: {{^}}atomic_add_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_add_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -158,10 +158,10 @@ define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32
   ret void
 }
 
-; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_sub_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -172,10 +172,10 @@ define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32
   ret void
 }
 
-; SI-LABEL: {{^}}atomic_and_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_and_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -186,10 +186,10 @@ define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32
   ret void
 }
 
-; SI-LABEL: {{^}}atomic_or_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_or_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -200,10 +200,10 @@ define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32
   ret void
 }
 
-; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_xor_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -224,10 +224,10 @@ define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32
 ; ret void
 ; }
 
-; SI-LABEL: {{^}}atomic_min_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_min_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -238,10 +238,10 @@ define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32
   ret void
 }
 
-; SI-LABEL: {{^}}atomic_max_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_max_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -252,10 +252,10 @@ define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32
   ret void
 }
 
-; SI-LABEL: {{^}}atomic_umin_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_umin_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -266,10 +266,10 @@ define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i3
   ret void
 }
 
-; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0:
-; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
-; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-; SI: s_endpgm
+; GCN-LABEL: {{^}}atomic_umax_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
 define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
@@ -280,5 +280,109 @@ define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i3
   ret void
 }
 
+; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_lds:
+; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
+; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32
+; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
+; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
+define void @shl_add_ptr_combine_2use_lds(i32 %idx) #0 {
+  %idx.add = add nuw i32 %idx, 4
+  %shl0 = shl i32 %idx.add, 3
+  %shl1 = shl i32 %idx.add, 4
+  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
+  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
+  store volatile i32 9, i32 addrspace(3)* %ptr0
+  store volatile i32 10, i32 addrspace(3)* %ptr1
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_lds_offset:
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
+; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:65528
+; GCN-DAG: v_add_i32_e32 [[ADD1:v[0-9]+]], vcc, 0x1fff0, [[SCALE1]]
+; GCN: ds_write_b32 [[ADD1]], v{{[0-9]+$}}
+define void @shl_add_ptr_combine_2use_max_lds_offset(i32 %idx) #0 {
+  %idx.add = add nuw i32 %idx, 8191
+  %shl0 = shl i32 %idx.add, 3
+  %shl1 = shl i32 %idx.add, 4
+  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
+  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
+  store volatile i32 9, i32 addrspace(3)* %ptr0
+  store volatile i32 10, i32 addrspace(3)* %ptr1
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_both_max_lds_offset:
+; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 0x1000, v0
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]]
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]]
+; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+$}}
+; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+$}}
+define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 {
+  %idx.add = add nuw i32 %idx, 4096
+  %shl0 = shl i32 %idx.add, 4
+  %shl1 = shl i32 %idx.add, 5
+  %ptr0 = inttoptr i32 %shl0 to i32 addrspace(3)*
+  %ptr1 = inttoptr i32 %shl1 to i32 addrspace(3)*
+  store volatile i32 9, i32 addrspace(3)* %ptr0
+  store volatile i32 10, i32 addrspace(3)* %ptr1
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private:
+; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0
+; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s4 offen offset:16
+; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0
+; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s4 offen offset:32
+define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
+  %idx = zext i16 %idx.arg to i32
+  %idx.add = add nuw i32 %idx, 4
+  %shl0 = shl i32 %idx.add, 2
+  %shl1 = shl i32 %idx.add, 3
+  %ptr0 = inttoptr i32 %shl0 to i32*
+  %ptr1 = inttoptr i32 %shl1 to i32*
+  store volatile i32 9, i32* %ptr0
+  store volatile i32 10, i32* %ptr1
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_private_offset:
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s4 offen offset:4088
+; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 0x1ff0, [[SCALE1]]
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], s4 offen{{$}}
+define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) #0 {
+  %idx = zext i16 %idx.arg to i32
+  %idx.add = add nuw i32 %idx, 511
+  %shl0 = shl i32 %idx.add, 3
+  %shl1 = shl i32 %idx.add, 4
+  %ptr0 = inttoptr i32 %shl0 to i32*
+  %ptr1 = inttoptr i32 %shl1 to i32*
+  store volatile i32 9, i32* %ptr0
+  store volatile i32 10, i32* %ptr1
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_both_max_private_offset:
+; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 0x100, v0
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]]
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]]
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s4 offen{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s4 offen{{$}}
+define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.arg) #0 {
+  %idx = zext i16 %idx.arg to i32
+  %idx.add = add nuw i32 %idx, 256
+  %shl0 = shl i32 %idx.add, 4
+  %shl1 = shl i32 %idx.add, 5
+  %ptr0 = inttoptr i32 %shl0 to i32*
+  %ptr1 = inttoptr i32 %shl1 to i32*
+  store volatile i32 9, i32* %ptr0
+  store volatile i32 10, i32* %ptr1
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
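
A quick arithmetic cross-check of the offsets the new 2-use LDS tests expect (a worked example only, not part of the commit): the combine produces base = idx << shift plus an immediate of constant << shift, and the immediate must pass the addressing-mode check, e.g. fit the 16-bit DS offset field for LDS accesses.

#include <cassert>

int main() {
  // shl_add_ptr_combine_2use_lds: (idx + 4) << 3 and (idx + 4) << 4.
  assert((4 << 3) == 32);         // ds_write_b32 ... offset:32
  assert((4 << 4) == 64);         // ds_write_b32 ... offset:64

  // shl_add_ptr_combine_2use_max_lds_offset: (idx + 8191) << 3 and << 4.
  assert((8191 << 3) == 65528);   // still fits the 16-bit DS offset field
  assert((8191 << 4) == 0x1fff0); // 131056 is too large, so a v_add of
                                  // 0x1fff0 is emitted instead of an
                                  // immediate offset
  return 0;
}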