From ebf08bfb482b4c44e4a64e196473d915d08b8c26 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 6 Apr 2017 16:48:30 +0000 Subject: [PATCH] [AMDGPU] Eliminate barrier if workgroup size is not greater than wavefront size If a workgroup size is known to be not greater than wavefront size the s_barrier instruction is not needed since all threads are guaranteed to come to the same point at the same time. Differential Revision: https://reviews.llvm.org/D31731 llvm-svn: 299659 --- lib/Target/AMDGPU/SIISelLowering.cpp | 11 ++++++++ test/CodeGen/AMDGPU/barrier-elimination.ll | 30 ++++++++++++++++++++++ test/CodeGen/AMDGPU/indirect-private-64.ll | 2 +- 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/AMDGPU/barrier-elimination.ll diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 36f16e3ca9a..3052439358a 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3159,6 +3159,17 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); } + case Intrinsic::amdgcn_s_barrier: { + if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { + const MachineFunction &MF = DAG.getMachineFunction(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; + if (WGSize <= ST.getWavefrontSize()) + return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, + Op.getOperand(0)), 0); + } + return SDValue(); + }; default: return Op; } diff --git a/test/CodeGen/AMDGPU/barrier-elimination.ll b/test/CodeGen/AMDGPU/barrier-elimination.ll new file mode 100644 index 00000000000..c526baaab9c --- /dev/null +++ b/test/CodeGen/AMDGPU/barrier-elimination.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s + +; CHECK-LABEL: {{^}}unknown_wgs: +; CHECK: s_barrier 
+define amdgpu_kernel void @unknown_wgs() { + tail call void @llvm.amdgcn.s.barrier() #0 + ret void +} + +; CHECK-LABEL: {{^}}flat_wgs_attr_32_128: +; CHECK: s_barrier +define amdgpu_kernel void @flat_wgs_attr_32_128() #1 { + tail call void @llvm.amdgcn.s.barrier() #0 + ret void +} + +; CHECK-LABEL: {{^}}flat_wgs_attr_32_64: +; CHECK: : +; CHECK-NEXT: ; wave barrier +; CHECK-NEXT: s_endpgm +define amdgpu_kernel void @flat_wgs_attr_32_64() #2 { + tail call void @llvm.amdgcn.s.barrier() #0 + ret void +} + +declare void @llvm.amdgcn.s.barrier() #0 + +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind "amdgpu-flat-work-group-size"="32,128" } +attributes #2 = { nounwind "amdgpu-flat-work-group-size"="32,64" } diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll index b6dea01cab4..7f08a89d149 100644 --- a/test/CodeGen/AMDGPU/indirect-private-64.ll +++ b/test/CodeGen/AMDGPU/indirect-private-64.ll @@ -121,4 +121,4 @@ define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* n } attributes #0 = { convergent nounwind } -attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="64,64" } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="64,128" }