
[AMDGPU] Fix infinite loop with fma combines

https://reviews.llvm.org/D72312 introduced an infinite loop which involves
DAGCombiner::visitFMA and AMDGPUTargetLowering::performFNegCombine.

fma(a, fneg(b), fneg(c)) => fneg(fma(a, b, c)) => fma(a, fneg(b), fneg(c)) => ...

This only breaks with types for which 'isFNegFree' returns false, e.g. v4f32.
Reproducing the issue also requires the 'no-signed-zeros-fp-math' attribute and
a user of the Op that does not allow source modifiers.
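
For illustration, a condensed IR sketch of the shape that triggers the loop
(the kernel name @repro is made up for this sketch; the complete reproducer,
@fma_neg_b_c_v4f32, is added in the test diff below):

; Condensed sketch only; the full test added below is @fma_neg_b_c_v4f32.
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

define amdgpu_kernel void @repro(<4 x float> addrspace(1)* %out, <4 x float> %a,
                                 <4 x float> %b, <4 x float> %c) #2 {
  ; fma with both of its last operands negated; the only user of the fma
  ; is a store, which cannot absorb a source modifier.
  %nb = fneg fast <4 x float> %b
  %nc = fneg fast <4 x float> %c
  %fma = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %nb, <4 x float> %nc)
  store <4 x float> %fma, <4 x float> addrspace(1)* %out
  ret void
}

; v4f32 so that isFNegFree is false, plus the attribute mentioned above.
attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }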

This fix indicates that it is not free to negate an fma if it has users
that do not allow source modifiers.

Differential Revision: https://reviews.llvm.org/D73939
Austin Kerbow 2020-02-03 17:08:26 -08:00
parent 6a53042562
commit 4d4cde0c25
4 changed files with 84 additions and 13 deletions


@@ -734,6 +734,26 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
  }
}

char AMDGPUTargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
                                              bool LegalOperations,
                                              bool ForCodeSize,
                                              unsigned Depth) const {
  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return 0;
    break;
  }
  default:
    break;
  }
  return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
                                            ForCodeSize, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//


@@ -172,6 +172,9 @@ public:
  bool isZExtFree(EVT Src, EVT Dest) const override;
  bool isZExtFree(SDValue Val, EVT VT2) const override;

  char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
                          bool ForCodeSize, unsigned Depth) const override;

  bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
  MVT getVectorIdxTy(const DataLayout &) const override;


@@ -10,6 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
@@ -628,12 +629,12 @@ define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
}

; Make sure negative constant cancels out fneg
; SI-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI-NOT: [[A]]
; SI-NOT: [[B]]
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid

@@ -650,12 +651,12 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, flo
ret void
}

; SI-LABEL: {{^}}fma_2.0_neg_a_b_f32:
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; SI-NOT: [[A]]
; SI-NOT: [[B]]
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid

@@ -672,6 +673,30 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float a
ret void
}

; SI-LABEL: {{^}}fma_neg_b_c_v4f32:
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @fma_neg_b_c_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #2 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.0, i32 1
%gep.2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.1, i32 2
%gep.out = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
%tmp0 = load <4 x float>, <4 x float> addrspace(1)* %gep.0
%tmp1 = load <4 x float>, <4 x float> addrspace(1)* %gep.1
%tmp2 = load <4 x float>, <4 x float> addrspace(1)* %gep.2
%fneg0 = fneg fast <4 x float> %tmp0
%fneg1 = fneg fast <4 x float> %tmp1
%fma0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %tmp2, <4 x float> %fneg0, <4 x float> %fneg1)
store <4 x float> %fma0, <4 x float> addrspace(1)* %gep.out
ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }


@@ -1399,6 +1399,28 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs
ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext
%b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext
%c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
%a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep
%b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep
%c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep
%fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
%fneg = fneg <4 x float> %fma
store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep
ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
@@ -2520,6 +2542,7 @@ define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1