diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5ba95fa90f5..a223c2d19b0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -734,6 +734,26 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
   }
 }
 
+char AMDGPUTargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
+                                              bool LegalOperations,
+                                              bool ForCodeSize,
+                                              unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  case ISD::FMA:
+  case ISD::FMAD: {
+    // Negating a fma is not free if it has users without source mods.
+    if (!allUsesHaveSourceMods(Op.getNode()))
+      return 0;
+    break;
+  }
+  default:
+    break;
+  }
+
+  return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
+                                            ForCodeSize, Depth);
+}
+
 //===---------------------------------------------------------------------===//
 // Target Properties
 //===---------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 7b269e8b21e..3847be2cc59 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -172,6 +172,9 @@ public:
   bool isZExtFree(EVT Src, EVT Dest) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;
 
+  char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
+                          bool ForCodeSize, unsigned Depth) const override;
+
   bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
 
   MVT getVectorIdxTy(const DataLayout &) const override;
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
index 3b3c87f17f7..a962a3b4ed0 100644
--- a/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -10,6 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare double @llvm.fabs.f64(double) #0
 declare double @llvm.fma.f64(double, double, double) #0
 declare float @llvm.fma.f32(float, float, float) #0
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
 
 ; (fadd (fmul x, y), z) -> (fma x, y, z)
 ; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
@@ -628,12 +629,12 @@ define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
 }
 
 ; Make sure negative constant cancels out fneg
-; GCN-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
-; GCN-NOT: [[A]]
-; GCN-NOT: [[B]]
-; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
+; SI-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
+; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI-NOT: [[A]]
+; SI-NOT: [[B]]
+; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
 define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -650,12 +651,12 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, flo
   ret void
 }
 
-; GCN-LABEL: {{^}}fma_2.0_neg_a_b_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
-; GCN-NOT: [[A]]
-; GCN-NOT: [[B]]
-; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
+; SI-LABEL: {{^}}fma_2.0_neg_a_b_f32:
+; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI-NOT: [[A]]
+; SI-NOT: [[B]]
+; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
 define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -672,6 +673,30 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float a
   ret void
 }
 
+; SI-LABEL: {{^}}fma_neg_b_c_v4f32:
+; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+define amdgpu_kernel void @fma_neg_b_c_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.0, i32 1
+  %gep.2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.1, i32 2
+  %gep.out = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
+
+  %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %gep.0
+  %tmp1 = load <4 x float>, <4 x float> addrspace(1)* %gep.1
+  %tmp2 = load <4 x float>, <4 x float> addrspace(1)* %gep.2
+
+  %fneg0 = fneg fast <4 x float> %tmp0
+  %fneg1 = fneg fast <4 x float> %tmp1
+  %fma0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %tmp2, <4 x float> %fneg0, <4 x float> %fneg1)
+
+  store <4 x float> %fma0, <4 x float> addrspace(1)* %gep.out
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
-
+attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index 133afd4d32c..f0aeb2276f1 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -1399,6 +1399,28 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:
+
+; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
+define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep
+  %b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep
+  %c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep
+  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  %fneg = fneg <4 x float> %fma
+  store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
@@ -2520,6 +2542,7 @@ define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @llvm.fma.f32(float, float, float) #1
 declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
 declare float @llvm.sin.f32(float) #1
 declare float @llvm.trunc.f32(float) #1
 declare float @llvm.round.f32(float) #1
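Note (illustration only, not part of the patch): isNegatibleForFree tells DAGCombiner how cheap it is to fold an fneg into an operation, and the override above reports an fma/fmad as not freely negatible whenever some user cannot absorb a source modifier. A minimal IR sketch of that multi-use situation follows; the kernel name @fneg_fma_multi_use is hypothetical, and the snippet can be fed to llc -march=amdgcn to observe the behavior.

; Illustrative only: the first store is a user of %fma that cannot take a
; source modifier, so allUsesHaveSourceMods() fails, the override returns 0,
; and the fneg is expected to stay a separate operation rather than being
; folded into the fma.
declare float @llvm.fma.f32(float, float, float)

define amdgpu_kernel void @fneg_fma_multi_use(float addrspace(1)* %out, float %a, float %b, float %c) {
  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fma, float addrspace(1)* %out
  store volatile float %fneg, float addrspace(1)* %out
  ret void
}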