From e479314184130f2ac91964272902bb37a4d00d34 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 10 Aug 2020 12:25:48 -0400
Subject: [PATCH] AMDGPU: Handle intrinsics in performMemSDNodeCombine

This avoids a possible regression in a future patch.
---
 lib/Target/AMDGPU/SIISelLowering.cpp      | 24 ++++++++++++++++++++----
 test/CodeGen/AMDGPU/shl_add_ptr_csub.ll   | 22 ++++++++++++++++++++++
 test/CodeGen/AMDGPU/shl_add_ptr_global.ll | 27 +++++++++++++++++++++++----
 3 files changed, 65 insertions(+), 8 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/shl_add_ptr_csub.ll

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index e32270652fe..ddb84b4e81f 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -870,6 +870,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 
   // FIXME: In other contexts we pretend this is a per-function property.
   setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
@@ -8563,14 +8565,28 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
 }
 
+/// MemSDNode::getBasePtr() does not work for intrinsics, whose base pointer
+/// operand is offset by the chain and intrinsic ID. Theoretically we would
+/// also need to check the specific intrinsic.
+static unsigned getBasePtrIndex(const MemSDNode *N) {
+  switch (N->getOpcode()) {
+  case ISD::STORE:
+  case ISD::INTRINSIC_W_CHAIN:
+  case ISD::INTRINSIC_VOID:
+    return 2;
+  default:
+    return 1;
+  }
+}
+
 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                   DAGCombinerInfo &DCI) const {
-  // FIXME: getBasePtr does not work correctly for intrinsic nodes and will find
-  // the intrinsic ID, not the pointer.
-  SDValue Ptr = N->getBasePtr();
   SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
 
+  unsigned PtrIdx = getBasePtrIndex(N);
+  SDValue Ptr = N->getOperand(PtrIdx);
+
   // TODO: We could also do this for multiplies.
   if (Ptr.getOpcode() == ISD::SHL) {
     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
@@ -8578,7 +8594,7 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
     if (NewPtr) {
       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
 
-      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
+      NewOps[PtrIdx] = NewPtr;
       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
     }
   }
diff --git a/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll b/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
new file mode 100644
index 00000000000..002ed5c4d2b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+
+; GCN-LABEL: {{^}}shl_base_atomicrmw_global_atomic_csub_ptr:
+; GCN-DAG: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 43
+; GCN: v_add_co_u32_e64 v[[EXTRA_LO:[0-9]+]], vcc_lo, 0x80, v4
+; GCN: v_add_co_ci_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc_lo, 0, v5, vcc_lo
+; GCN: global_atomic_csub v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]], off offset:512 glc
+; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}}
+define i32 @shl_base_atomicrmw_global_atomic_csub_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32
+  %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
+  %shl = shl i64 %cast, 2
+  %castback = inttoptr i64 %shl to i32 addrspace(1)*
+  %val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %castback, i32 43)
+  store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4
+  ret i32 %val
+}
+
+declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0
+
+attributes #0 = { argmemonly nounwind }
diff --git a/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
index 2260de0b74f..fb74c0829fc 100644
--- a/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
+++ b/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -1,13 +1,13 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
-; GCN-LABEL: {{^}}shl_base_global_ptr:
+; GCN-LABEL: {{^}}shl_base_atomicrmw_global_ptr:
 ; GCN: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4
 ; GCN: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc
 ; GCN: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5]
 ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
 ; GCN: global_atomic_and v{{\[}}[[LO]]:[[HI]]{{\]}}, [[THREE]], off offset:512
 ; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}}
-define void @shl_base_global_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
+define void @shl_base_atomicrmw_global_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32
   %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
   %shl = shl i64 %cast, 2
@@ -17,5 +17,24 @@ define void @shl_base_atomicrmw_global_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extr
   ret void
 }
 
+; GCN-LABEL: {{^}}shl_base_global_ptr_global_atomic_fadd:
+; GCN: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4
+; GCN: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc
+; GCN: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5]
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
+; GCN: global_atomic_add_f32 v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]], off offset:512
+; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}}
+define void @shl_base_global_ptr_global_atomic_fadd(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32
+  %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
+  %shl = shl i64 %cast, 2
+  %castback = inttoptr i64 %shl to float addrspace(1)*
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %castback, float 100.0)
+  store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #1
+
 attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+attributes #1 = { argmemonly nounwind willreturn }
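
The operand layout this relies on: for ISD::INTRINSIC_W_CHAIN / ISD::INTRINSIC_VOID
nodes operand 0 is the chain and operand 1 is the intrinsic ID, so for the intrinsics
handled here the base pointer sits at operand 2, the same index a store uses behind
its chain and stored value; for the other MemSDNodes reaching this combine the
pointer follows the chain at operand 1. That is why the old code indexed NewOps with
"getOpcode() == ISD::STORE ? 2 : 1", and why getBasePtr() returned the intrinsic ID
rather than the pointer for these nodes. Below is a minimal standalone sketch of
that index selection, using mock types invented purely for illustration; it does not
touch the real SelectionDAG API.

// Standalone illustration (mock types invented here; this is not LLVM code).
// It mirrors the operand-index selection that getBasePtrIndex() performs on
// MemSDNode in the patch above.
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

enum Opcode { LOAD, STORE, INTRINSIC_W_CHAIN, INTRINSIC_VOID };

struct MockNode {
  Opcode Op;
  // Operand layout assumed for this illustration:
  //   LOAD:         chain, pointer
  //   STORE:        chain, stored value, pointer
  //   INTRINSIC_*:  chain, intrinsic ID, pointer, ...
  std::vector<std::string> Operands;
};

// Same case structure as the patch's getBasePtrIndex().
static unsigned getBasePtrIndex(const MockNode &N) {
  switch (N.Op) {
  case STORE:
  case INTRINSIC_W_CHAIN:
  case INTRINSIC_VOID:
    return 2;
  default:
    return 1;
  }
}

int main() {
  MockNode Csub{INTRINSIC_W_CHAIN, {"chain", "intrinsic-id", "ptr", "data"}};
  MockNode Load{LOAD, {"chain", "ptr"}};

  // For an intrinsic node, the old getBasePtr()-style lookup (operand 1 for
  // anything that is not a store) would return the intrinsic ID here instead
  // of the pointer, which is what the removed FIXME complained about.
  assert(Csub.Operands[getBasePtrIndex(Csub)] == "ptr");
  assert(Load.Operands[getBasePtrIndex(Load)] == "ptr");
  std::printf("csub pointer operand index: %u\n", getBasePtrIndex(Csub));
  return 0;
}

Registering ISD::INTRINSIC_VOID and ISD::INTRINSIC_W_CHAIN with setTargetDAGCombine()
in the constructor is what routes these intrinsic nodes into the combine in the first
place, so the shl-of-pointer folding exercised by the new global_atomic_csub and
global_atomic_fadd tests can fire on them.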