From 28862346c29311fc2d3f3e9a9abe72a994933a0d Mon Sep 17 00:00:00 2001 From: Daniil Fukalov Date: Fri, 26 Jan 2018 11:09:38 +0000 Subject: [PATCH] [AMDGPU] fix LDS f32 intrinsics - use an address-space-qualified pointer type in the intrinsic class to avoid the .f32 name mangling - rename the overly generic "atomic" intrinsic names to "ds" - add the missing intrinsics to AMDGPUTTIImpl::getTgtMemIntrinsic Reviewed by: b-sumner Differential Revision: https://reviews.llvm.org/D42383 llvm-svn: 323516 --- include/llvm/IR/IntrinsicsAMDGPU.td | 10 +++--- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 11 +++--- lib/Target/AMDGPU/SIISelLowering.cpp | 24 ++++++------- lib/Transforms/Scalar/InferAddressSpaces.cpp | 12 +++---- test/CodeGen/AMDGPU/lds_atomic_f32.ll | 36 +++++++++---------- 5 files changed, 48 insertions(+), 45 deletions(-) diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index 454b62bdfb6..dc877e539a4 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -295,10 +295,10 @@ class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty], def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin; def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin; -class AMDGPUAtomicF32Intrin : +class AMDGPULDSF32Intrin : GCCBuiltin, Intrinsic<[llvm_float_ty], - [LLVMAnyPointerType, + [LLVMQualPointerType, llvm_float_ty, llvm_i32_ty, // ordering llvm_i32_ty, // scope @@ -306,9 +306,9 @@ class AMDGPUAtomicF32Intrin : [IntrArgMemOnly, NoCapture<0>] >; -def int_amdgcn_atomic_fadd : AMDGPUAtomicF32Intrin<"__builtin_amdgcn_ds_fadd">; -def int_amdgcn_atomic_fmin : AMDGPUAtomicF32Intrin<"__builtin_amdgcn_ds_fmin">; -def int_amdgcn_atomic_fmax : AMDGPUAtomicF32Intrin<"__builtin_amdgcn_ds_fmax">; +def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fadd">; +def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmin">; +def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmax">; class AMDGPUImageLoad : Intrinsic < [llvm_anyfloat_ty], // 
vdata(VGPR) diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 21088d3e48e..3ad099ca686 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -292,7 +292,10 @@ bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const { switch (Inst->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { auto *Ordering = dyn_cast(Inst->getArgOperand(2)); auto *Volatile = dyn_cast(Inst->getArgOperand(4)); if (!Ordering || !Volatile) @@ -475,9 +478,9 @@ static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) { case Intrinsic::r600_read_tidig_z: case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_atomic_fadd: - case Intrinsic::amdgcn_atomic_fmin: - case Intrinsic::amdgcn_atomic_fmax: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_image_atomic_swap: case Intrinsic::amdgcn_image_atomic_add: case Intrinsic::amdgcn_image_atomic_sub: diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 7dc9dcf31fc..913bf078679 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -566,9 +566,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_atomic_fadd: - case Intrinsic::amdgcn_atomic_fmin: - case Intrinsic::amdgcn_atomic_fmax: { + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = 
CI.getOperand(0); @@ -807,9 +807,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_atomic_fadd: - case Intrinsic::amdgcn_atomic_fmin: - case Intrinsic::amdgcn_atomic_fmax: { + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); Ops.push_back(Ptr); @@ -4827,9 +4827,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_atomic_fadd: - case Intrinsic::amdgcn_atomic_fmin: - case Intrinsic::amdgcn_atomic_fmax: { + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast(Op); unsigned Opc; switch (IntrID) { @@ -4839,13 +4839,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_atomic_dec: Opc = AMDGPUISD::ATOMIC_DEC; break; - case Intrinsic::amdgcn_atomic_fadd: + case Intrinsic::amdgcn_ds_fadd: Opc = AMDGPUISD::ATOMIC_LOAD_FADD; break; - case Intrinsic::amdgcn_atomic_fmin: + case Intrinsic::amdgcn_ds_fmin: Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; break; - case Intrinsic::amdgcn_atomic_fmax: + case Intrinsic::amdgcn_ds_fmax: Opc = AMDGPUISD::ATOMIC_LOAD_FMAX; break; default: diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp index e4591649038..c87077e0dc2 100644 --- a/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -261,9 +261,9 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_atomic_fadd: - case Intrinsic::amdgcn_atomic_fmin: - case 
Intrinsic::amdgcn_atomic_fmax: { + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { const ConstantInt *IsVolatile = dyn_cast(II->getArgOperand(4)); if (!IsVolatile || !IsVolatile->isZero()) return false; @@ -292,9 +292,9 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands( case Intrinsic::objectsize: case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_atomic_fadd: - case Intrinsic::amdgcn_atomic_fmin: - case Intrinsic::amdgcn_atomic_fmax: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0), PostorderStack, Visited); break; diff --git a/test/CodeGen/AMDGPU/lds_atomic_f32.ll b/test/CodeGen/AMDGPU/lds_atomic_f32.ll index 18aebe12e7f..a33fcf4db02 100644 --- a/test/CodeGen/AMDGPU/lds_atomic_f32.ll +++ b/test/CodeGen/AMDGPU/lds_atomic_f32.ll @@ -1,11 +1,11 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -declare float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1) -declare float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1) -declare float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fadd(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fmin(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fmax(float addrspace(3)* nocapture, float, i32, i32, i1) -; GCN-LABEL: {{^}}lds_atomic_fadd_f32: +; GCN-LABEL: {{^}}lds_ds_fadd: ; VI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 ; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 @@ -13,20 +13,20 @@ declare float 
@llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* nocapture, float, ; GCN: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 ; GCN: s_waitcnt lgkmcnt(1) ; GCN: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] -define amdgpu_kernel void @lds_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { +define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { %idx.add = add nuw i32 %idx, 4 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + %a1 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) store float %a3, float addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}lds_atomic_fmin_f32: +; GCN-LABEL: {{^}}lds_ds_fmin: ; VI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 ; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 @@ -34,20 +34,20 @@ define amdgpu_kernel void @lds_atomic_fadd_f32(float addrspace(1)* %out, float a ; GCN: ds_min_f32 [[V3:v[0-9]+]], [[V0]] offset:64 ; GCN: s_waitcnt lgkmcnt(1) ; GCN: ds_min_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] -define amdgpu_kernel void @lds_atomic_fmin_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { +define amdgpu_kernel void @lds_ds_fmin(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { %idx.add = add nuw i32 %idx, 4 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 
%idx.add, 4 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + %a1 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) store float %a3, float addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}lds_atomic_fmax_f32: +; GCN-LABEL: {{^}}lds_ds_fmax: ; VI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 ; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 @@ -55,15 +55,15 @@ define amdgpu_kernel void @lds_atomic_fmin_f32(float addrspace(1)* %out, float a ; GCN: ds_max_f32 [[V3:v[0-9]+]], [[V0]] offset:64 ; GCN: s_waitcnt lgkmcnt(1) ; GCN: ds_max_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] -define amdgpu_kernel void @lds_atomic_fmax_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { +define amdgpu_kernel void @lds_ds_fmax(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { %idx.add = add nuw i32 %idx, 4 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + %a1 = call float @llvm.amdgcn.ds.fmax(float 
addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) store float %a3, float addrspace(1)* %out ret void }