1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 12:41:49 +01:00

[AMDGPU] Tune threshold for cmp/select vector lowering

It was set in terms of total vector size, while the idea was to limit
the number of instructions. Now that it has started to work with doubles,
the thresholds need to be updated.

Differential Revision: https://reviews.llvm.org/D80322
This commit is contained in:
Stanislav Mekhanoshin 2020-05-20 11:51:07 -07:00
parent bd64a1c8db
commit 6d06d957a7
3 changed files with 64 additions and 19 deletions

View File

@ -9474,6 +9474,39 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
return SDValue();
}
// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
// expanded into a set of cmp/select instructions.
static bool shouldExpandVectorDynExt(SDNode *N) {
  // The variable index is always the last operand for both EXTRACT and INSERT.
  SDValue Index = N->getOperand(N->getNumOperands() - 1);

  // A constant index lowers directly; the divergent-register-indexing option
  // is a user override that disables this expansion altogether.
  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Index))
    return false;

  SDValue Vector = N->getOperand(0);
  EVT VectorVT = Vector.getValueType();
  EVT ElementVT = VectorVT.getVectorElementType();
  unsigned VectorBits = VectorVT.getSizeInBits();
  unsigned ElementBits = ElementVT.getSizeInBits();
  unsigned ElementCount = VectorVT.getVectorNumElements();

  // Sub-dword vectors of size 2 dword or less have better implementation.
  if (VectorBits <= 64 && ElementBits < 32)
    return false;

  // Always expand the rest of sub-dword instructions, otherwise it will be
  // lowered via memory.
  if (ElementBits < 32)
    return true;

  // Always do this if var-idx is divergent, otherwise it will become a loop.
  if (Index->isDivergent())
    return true;

  // Large vectors would yield too many compares and v_cndmask_b32
  // instructions: one compare per element, plus one cndmask per 32-bit
  // chunk of each element.
  unsigned NumCompares = ElementCount;
  unsigned NumCndMasks = ((ElementBits + 31) / 32) * ElementCount;
  return NumCompares + NumCndMasks <= 16;
}
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
@ -9535,15 +9568,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
unsigned EltSize = EltVT.getSizeInBits();
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
// This eliminates non-constant index and subsequent movrel or scratch access.
// Sub-dword vectors of size 2 dword or less have better implementation.
// Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
// instructions.
// Always do this if var-idx is divergent, otherwise it will become a loop.
if (!UseDivergentRegisterIndexing &&
(VecSize <= 256 || N->getOperand(1)->isDivergent()) &&
(VecSize > 64 || EltSize >= 32) &&
!isa<ConstantSDNode>(N->getOperand(1))) {
if (shouldExpandVectorDynExt(N)) {
SDLoc SL(N);
SDValue Idx = N->getOperand(1);
SDValue V;
@ -9603,19 +9628,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
SDValue Idx = N->getOperand(2);
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
// INSERT_VECTOR_ELT (<n x e>, var-idx)
// => BUILD_VECTOR n x select (e, const-idx)
// This eliminates non-constant index and subsequent movrel or scratch access.
// Sub-dword vectors of size 2 dword or less have better implementation.
// Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
// instructions.
// Always do this if var-idx is divergent, otherwise it will become a loop.
if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx) ||
(VecSize > 256 && !Idx->isDivergent()) ||
(VecSize <= 64 && EltSize < 32))
if (!shouldExpandVectorDynExt(N))
return SDValue();
SelectionDAG &DAG = DCI.DAG;

View File

@ -48,6 +48,24 @@ entry:
ret void
}
; GCN-LABEL: {{^}}double5_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
; GCN: store_dwordx2 v[{{[0-9:]+}}]
; Extract a double from a constant <5 x double> with a non-constant index
; (%sel is a scalar kernel argument). The GCN checks above expect the
; cmp/cndmask select expansion rather than a buffer (scratch) access.
define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
entry:
%ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
store double %ext, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}half4_extelt:
; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00

View File

@ -277,6 +277,17 @@ entry:
ret void
}
; GCN-LABEL: {{^}}double5_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-COUNT-10: v_cndmask_b32
; Insert 1.0 into a <5 x double> at a non-constant index. The GCN checks
; above expect the cmp/select expansion (v_cndmask_b32) instead of movrel
; or a buffer (scratch) access.
define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
entry:
%v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
store <5 x double> %v, <5 x double> addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}double8_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_