mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 05:01:59 +01:00
AMDGPU: Use splat vectors for undefs when folding canonicalize
If one of the elements is undef, use the canonicalized constant from the other element instead of 0. Splat vectors are more useful for other optimizations, such as matching vector clamps. This was breaking on clamps of half3 from the undef 4th component. llvm-svn: 339512
This commit is contained in:
parent
50e345a4f8
commit
cc2045f6d7
@ -6989,27 +6989,42 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
|
||||
|
||||
// TODO: This could be better with wider vectors that will be split to v2f16,
|
||||
// and to consider uses since there aren't that many packed operations.
|
||||
if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) {
|
||||
if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
|
||||
isTypeLegal(MVT::v2f16)) {
|
||||
SDLoc SL(N);
|
||||
SDValue NewElts[2];
|
||||
SDValue Lo = N0.getOperand(0);
|
||||
SDValue Hi = N0.getOperand(1);
|
||||
EVT EltVT = Lo.getValueType();
|
||||
|
||||
if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
|
||||
for (unsigned I = 0; I != 2; ++I) {
|
||||
SDValue Op = N0.getOperand(I);
|
||||
EVT EltVT = Op.getValueType();
|
||||
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
|
||||
NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
|
||||
CFP->getValueAPF());
|
||||
} else if (Op.isUndef()) {
|
||||
// This would ordinarily be folded to a qNaN. Since this may be half
|
||||
// of a packed operation, it may be cheaper to use a 0.
|
||||
NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT);
|
||||
// Handled below based on what the other operand is.
|
||||
NewElts[I] = Op;
|
||||
} else {
|
||||
NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
|
||||
}
|
||||
}
|
||||
|
||||
// If one half is undef, and one is constant, prefer a splat vector rather
|
||||
// than the normal qNaN. If it's a register, prefer 0.0 since that's
|
||||
// cheaper to use and may be free with a packed operation.
|
||||
if (NewElts[0].isUndef()) {
|
||||
if (isa<ConstantFPSDNode>(NewElts[1]))
|
||||
NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
|
||||
NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
|
||||
}
|
||||
|
||||
if (NewElts[1].isUndef()) {
|
||||
NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
|
||||
NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
|
||||
}
|
||||
|
||||
return DAG.getBuildVector(VT, SL, NewElts);
|
||||
}
|
||||
}
|
||||
|
@ -688,6 +688,38 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(<2 x half> addrspace(1)* %out,
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GFX9-NOT: [[A]]
|
||||
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
|
||||
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
|
||||
%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
|
||||
%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
|
||||
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
|
||||
%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
|
||||
|
||||
store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GFX9-NOT: [[A]]
|
||||
; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
|
||||
define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
|
||||
%out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
|
||||
%a = load <2 x half>, <2 x half> addrspace(1)* %gep0
|
||||
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
|
||||
%med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
|
||||
|
||||
store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
|
||||
; GCN: v_add_f32_e32 [[A:v[0-9]+]]
|
||||
; GCN: v_add_f32_e32 [[B:v[0-9]+]]
|
||||
|
@ -565,20 +565,71 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
|
||||
; GFX9: s_waitcnt
|
||||
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
|
||||
; VI: s_waitcnt
|
||||
; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_setpc_b64
|
||||
; GFX89: s_waitcnt
|
||||
; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX89-NEXT: s_setpc_b64
|
||||
define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
|
||||
%vec = insertelement <2 x half> undef, half %val, i32 1
|
||||
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
|
||||
ret <2 x half> %canonicalized
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16:
|
||||
; GCN: s_waitcnt
|
||||
; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
|
||||
; GFX89-NEXT: s_setpc_b64
|
||||
|
||||
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
|
||||
; CI-NEXT: v_mov_b32_e32 v1, 1.0
|
||||
; CI-NEXT: s_setpc_b64
|
||||
define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
|
||||
%vec = insertelement <2 x half> undef, half 1.0, i32 1
|
||||
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
|
||||
ret <2 x half> %canonicalized
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16:
|
||||
; GCN: s_waitcnt
|
||||
; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
|
||||
; GFX89-NEXT: s_setpc_b64
|
||||
|
||||
; CI-NEXT: v_mov_b32_e32 v0, 1.0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
|
||||
; CI-NEXT: s_setpc_b64
|
||||
define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
|
||||
%vec = insertelement <2 x half> undef, half 1.0, i32 0
|
||||
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
|
||||
ret <2 x half> %canonicalized
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_k_hi_v2f16:
|
||||
; GCN: s_waitcnt
|
||||
; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
|
||||
; GFX89-NEXT: s_setpc_b64
|
||||
|
||||
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
|
||||
; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
|
||||
; CI-NEXT: s_setpc_b64
|
||||
define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
|
||||
%vec = insertelement <2 x half> undef, half 16.0, i32 1
|
||||
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
|
||||
ret <2 x half> %canonicalized
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16:
|
||||
; GCN: s_waitcnt
|
||||
; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
|
||||
; GFX89-NEXT: s_setpc_b64
|
||||
|
||||
; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
|
||||
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
|
||||
; CI-NEXT: s_setpc_b64
|
||||
define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
|
||||
%vec = insertelement <2 x half> undef, half 16.0, i32 0
|
||||
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
|
||||
ret <2 x half> %canonicalized
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
|
||||
; GFX9: s_waitcnt
|
||||
; GFX9-DAG: v_max_f16_e32 v0, v0, v0
|
||||
|
Loading…
x
Reference in New Issue
Block a user