1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00

AMDGPU: Fix infinite loop in DAG combine with fneg + fma

We were not reporting isFNegFree for v2f32, although it is effectively
free after legalization. The generic combine was pulling fneg out of
the fma source operands, and the AMDGPU combine was doing the
opposite.
This commit is contained in:
Matt Arsenault 2021-06-12 11:21:57 -04:00
parent bf6259aa0b
commit 6c04830c65
2 changed files with 27 additions and 3 deletions

View File

@ -848,9 +848,9 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
assert(VT.isFloatingPoint());
return VT == MVT::f32 || VT == MVT::f64 ||
(Subtarget->has16BitInsts() && VT == MVT::f16) ||
(Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
// Report this based on the end legalized type.
VT = VT.getScalarType();
return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,

View File

@ -2575,8 +2575,31 @@ define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %
ret void
}
; The AMDGPU combine to pull fneg into the FMA operands was being
; undone by the generic combine to pull the fneg out of the fma if
; !isFNegFree. We were reporting false for v2f32 even though it will
; be split into f32 where it will be free.
; GCN-LABEL: {{^}}fneg_fma_fneg_dagcombine_loop:
; GCN: s_brev_b32 [[NEGZERO:s[0-9]+]], 1{{$}}
; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], v2, -v4, [[NEGZERO]]
; GCN-DAG: v_fma_f32 [[FMA1:v[0-9]+]], v3, -v5, [[NEGZERO]]
; GCN-DAG: v_sub_f32_e32 [[SUB0:v[0-9]+]], [[FMA0]], v0
; GCN-DAG: v_sub_f32_e32 [[SUB1:v[0-9]+]], [[FMA1]], v1
; GCN-DAG: v_mul_f32_e32 v0, [[SUB0]], v4
; GCN-DAG: v_mul_f32_e32 v1, [[SUB1]], v5
; GCN: s_setpc_b64
define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
bb:
%i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer)
%i4 = fadd fast <2 x float> %i3, %arg
%i5 = fneg <2 x float> %i4
%i6 = fmul fast <2 x float> %i5, %arg2
ret <2 x float> %i6
}
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare float @llvm.fmuladd.f32(float, float, float) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
declare float @llvm.sin.f32(float) #1
@ -2601,3 +2624,4 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }