mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 11:02:59 +02:00
[AMDGPU] Fix infinite loop with fma combines
https://reviews.llvm.org/D72312 introduced an infinite loop which involves DAGCombiner::visitFMA and AMDGPUTargetLowering::performFNegCombine. fma( a, fneg(b), fneg(c) ) => fneg( fma (a, b, c) ) => fma( a, fneg(b), fneg(c) ) ... This only breaks with types where 'isFNegFree' returns flase, e.g. v4f32. Reproducing the issue also needs the attribute 'no-signed-zeros-fp-math', and no source mods allowed on one of the users of the Op. This fix makes changes to indicate that it is not free to negate a fma if it has users with source mods. Differential Revision: https://reviews.llvm.org/D73939
This commit is contained in:
parent
6a53042562
commit
4d4cde0c25
@ -734,6 +734,26 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
|
||||
}
|
||||
}
|
||||
|
||||
char AMDGPUTargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
|
||||
bool LegalOperations,
|
||||
bool ForCodeSize,
|
||||
unsigned Depth) const {
|
||||
switch (Op.getOpcode()) {
|
||||
case ISD::FMA:
|
||||
case ISD::FMAD: {
|
||||
// Negating a fma is not free if it has users without source mods.
|
||||
if (!allUsesHaveSourceMods(Op.getNode()))
|
||||
return 0;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
|
||||
ForCodeSize, Depth);
|
||||
}
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
// Target Properties
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
@ -172,6 +172,9 @@ public:
|
||||
bool isZExtFree(EVT Src, EVT Dest) const override;
|
||||
bool isZExtFree(SDValue Val, EVT VT2) const override;
|
||||
|
||||
char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
|
||||
bool ForCodeSize, unsigned Depth) const override;
|
||||
|
||||
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
|
||||
|
||||
MVT getVectorIdxTy(const DataLayout &) const override;
|
||||
|
@ -10,6 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
declare double @llvm.fabs.f64(double) #0
|
||||
declare double @llvm.fma.f64(double, double, double) #0
|
||||
declare float @llvm.fma.f32(float, float, float) #0
|
||||
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #0
|
||||
|
||||
; (fadd (fmul x, y), z) -> (fma x, y, z)
|
||||
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
|
||||
@ -628,12 +629,12 @@ define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
|
||||
}
|
||||
|
||||
; Make sure negative constant cancels out fneg
|
||||
; GCN-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN-NOT: [[A]]
|
||||
; GCN-NOT: [[B]]
|
||||
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
|
||||
; SI-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
|
||||
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; SI-NOT: [[A]]
|
||||
; SI-NOT: [[B]]
|
||||
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
|
||||
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
|
||||
@ -650,12 +651,12 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, flo
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_2.0_neg_a_b_f32:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN-NOT: [[A]]
|
||||
; GCN-NOT: [[B]]
|
||||
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
|
||||
; SI-LABEL: {{^}}fma_2.0_neg_a_b_f32:
|
||||
; SI: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; SI: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; SI-NOT: [[A]]
|
||||
; SI-NOT: [[B]]
|
||||
; SI: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
|
||||
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
|
||||
@ -672,6 +673,30 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float a
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}fma_neg_b_c_v4f32:
|
||||
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; SI: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
define amdgpu_kernel void @fma_neg_b_c_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #2 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.0 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %tid
|
||||
%gep.1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.0, i32 1
|
||||
%gep.2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %gep.1, i32 2
|
||||
%gep.out = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
|
||||
|
||||
%tmp0 = load <4 x float>, <4 x float> addrspace(1)* %gep.0
|
||||
%tmp1 = load <4 x float>, <4 x float> addrspace(1)* %gep.1
|
||||
%tmp2 = load <4 x float>, <4 x float> addrspace(1)* %gep.2
|
||||
|
||||
%fneg0 = fneg fast <4 x float> %tmp0
|
||||
%fneg1 = fneg fast <4 x float> %tmp1
|
||||
%fma0 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %tmp2, <4 x float> %fneg0, <4 x float> %fneg1)
|
||||
|
||||
store <4 x float> %fma0, <4 x float> addrspace(1)* %gep.out
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind }
|
||||
|
||||
attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" }
|
||||
|
@ -1399,6 +1399,28 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:
|
||||
|
||||
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_fneg_fmad_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %a.ptr, <4 x float> addrspace(1)* %b.ptr, <4 x float> addrspace(1)* %c.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%c.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %c.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile <4 x float>, <4 x float> addrspace(1)* %a.gep
|
||||
%b = load volatile <4 x float>, <4 x float> addrspace(1)* %b.gep
|
||||
%c = load volatile <4 x float>, <4 x float> addrspace(1)* %c.gep
|
||||
%fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
|
||||
%fneg = fneg <4 x float> %fma
|
||||
store <4 x float> %fneg, <4 x float> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
|
||||
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
|
||||
@ -2520,6 +2542,7 @@ define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
declare float @llvm.fma.f32(float, float, float) #1
|
||||
declare float @llvm.fmuladd.f32(float, float, float) #1
|
||||
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
|
||||
declare float @llvm.sin.f32(float) #1
|
||||
declare float @llvm.trunc.f32(float) #1
|
||||
declare float @llvm.round.f32(float) #1
|
||||
|
Loading…
Reference in New Issue
Block a user