1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-02-01 05:01:59 +01:00

AMDGPU: Use splat vectors for undefs when folding canonicalize

If one of the elements is undef, use the canonicalized constant
from the other element instead of 0.

Splat vectors are more useful for other optimizations, such
as matching vector clamps. This was breaking on clamps
of half3 from the undef 4th component.

llvm-svn: 339512
This commit is contained in:
Matt Arsenault 2018-08-12 08:42:54 +00:00
parent 50e345a4f8
commit cc2045f6d7
3 changed files with 111 additions and 13 deletions

View File

@ -6989,27 +6989,42 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
// TODO: This could be better with wider vectors that will be split to v2f16,
// and to consider uses since there aren't that many packed operations.
if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) {
if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
isTypeLegal(MVT::v2f16)) {
SDLoc SL(N);
SDValue NewElts[2];
SDValue Lo = N0.getOperand(0);
SDValue Hi = N0.getOperand(1);
EVT EltVT = Lo.getValueType();
if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
for (unsigned I = 0; I != 2; ++I) {
SDValue Op = N0.getOperand(I);
EVT EltVT = Op.getValueType();
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
CFP->getValueAPF());
} else if (Op.isUndef()) {
// This would ordinarily be folded to a qNaN. Since this may be half
// of a packed operation, it may be cheaper to use a 0.
NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT);
// Handled below based on what the other operand is.
NewElts[I] = Op;
} else {
NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
}
}
// If one half is undef, and one is constant, prefer a splat vector rather
// than the normal qNaN. If it's a register, prefer 0.0 since that's
// cheaper to use and may be free with a packed operation.
if (NewElts[0].isUndef()) {
if (isa<ConstantFPSDNode>(NewElts[1]))
NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
}
if (NewElts[1].isUndef()) {
NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
}
return DAG.getBuildVector(VT, SL, NewElts);
}
}

View File

@ -688,6 +688,38 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out,
ret void
}
; Clamp pattern on <2 x half> where each limit vector has one undef lane:
; maxnum against <0.0, undef> followed by minnum against <undef, 1.0>.
; The checks expect GFX9 to select a single v_pk_max_f16 of the loaded value
; with itself using the clamp modifier, with no other use of the loaded
; register before it.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
ret void
}
; Same clamp pattern as the elts0 variant but with the undef lanes swapped:
; maxnum against <undef, 0.0> followed by minnum against <1.0, undef>.
; The checks expect GFX9 to select a single v_pk_max_f16 of the loaded value
; with itself using the clamp modifier, with no other use of the loaded
; register before it.
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GFX9-NOT: [[A]]
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
ret void
}
; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
; GCN: v_add_f32_e32 [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[B:v[0-9]+]]

View File

@ -565,20 +565,71 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
}
; Canonicalize of a <2 x half> built by inserting the %val argument into
; element 1 of an undef vector, so element 0 stays undef.
; NOTE(review): three sets of expectations are listed below (separate GFX9 and
; VI prefixes, plus a combined GFX89 prefix) and they describe different code
; for the same function. This looks like both sides of a change shown
; together; confirm which set is current before relying on it.
; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0
; GFX9-NEXT: s_setpc_b64
; VI: s_waitcnt
; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_setpc_b64
; GFX89: s_waitcnt
; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX89-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
%vec = insertelement <2 x half> undef, half %val, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
; Canonicalize of a <2 x half> with element 0 undef and element 1 set to 1.0.
; The GFX89 checks expect one 32-bit move of 0x3c003c00, i.e. the same 16-bit
; pattern in both halves (a splat of the constant rather than a NaN or zero
; in the undef lane). The CI checks expect two separate registers: 0x7fc00000
; for the undef element and 1.0 for the constant element.
; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16:
; GCN: s_waitcnt
; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GFX89-NEXT: s_setpc_b64
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; CI-NEXT: v_mov_b32_e32 v1, 1.0
; CI-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
%vec = insertelement <2 x half> undef, half 1.0, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
; Canonicalize of a <2 x half> with element 0 set to 1.0 and element 1 undef.
; The GFX89 checks expect one 32-bit move of 0x3c003c00, i.e. the same 16-bit
; pattern in both halves (a splat of the constant into the undef lane).
; The CI checks expect two separate registers: 1.0 for the constant element
; and 0x7fc00000 for the undef element.
; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16:
; GCN: s_waitcnt
; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GFX89-NEXT: s_setpc_b64
; CI-NEXT: v_mov_b32_e32 v0, 1.0
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
%vec = insertelement <2 x half> undef, half 1.0, i32 0
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
; Canonicalize of a <2 x half> with element 0 undef and element 1 set to 16.0,
; a constant that is written as a hex literal in the expected output below.
; The GFX89 checks expect one 32-bit move of 0x4c004c00, i.e. the same 16-bit
; pattern in both halves (a splat of the constant into the undef lane).
; The CI checks expect two separate registers: 0x7fc00000 for the undef
; element and 0x41800000 for the constant element.
; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_k_hi_v2f16:
; GCN: s_waitcnt
; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
; GFX89-NEXT: s_setpc_b64
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
; CI-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
%vec = insertelement <2 x half> undef, half 16.0, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
; Canonicalize of a <2 x half> with element 0 set to 16.0 and element 1 undef.
; The GFX89 checks expect one 32-bit move of 0x4c004c00, i.e. the same 16-bit
; pattern in both halves (a splat of the constant into the undef lane).
; The CI checks expect two separate registers: 0x41800000 for the constant
; element and 0x7fc00000 for the undef element.
; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16:
; GCN: s_waitcnt
; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
; GFX89-NEXT: s_setpc_b64
; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
%vec = insertelement <2 x half> undef, half 16.0, i32 0
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
; GFX9: s_waitcnt
; GFX9-DAG: v_max_f16_e32 v0, v0, v0