mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[AMDGPU] Tune threshold for cmp/select vector lowering
The threshold was previously expressed as a total vector size, while the intent was to limit the number of generated instructions. Now that the lowering also handles doubles, the thresholds need to be updated accordingly. Differential Revision: https://reviews.llvm.org/D80322
This commit is contained in:
parent
bd64a1c8db
commit
6d06d957a7
@ -9474,6 +9474,39 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
|
||||
// expanded into a set of cmp/select instructions.
|
||||
static bool shouldExpandVectorDynExt(SDNode *N) {
|
||||
SDValue Idx = N->getOperand(N->getNumOperands() - 1);
|
||||
if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
|
||||
return false;
|
||||
|
||||
SDValue Vec = N->getOperand(0);
|
||||
EVT VecVT = Vec.getValueType();
|
||||
EVT EltVT = VecVT.getVectorElementType();
|
||||
unsigned VecSize = VecVT.getSizeInBits();
|
||||
unsigned EltSize = EltVT.getSizeInBits();
|
||||
unsigned NumElem = VecVT.getVectorNumElements();
|
||||
|
||||
// Sub-dword vectors of size 2 dword or less have better implementation.
|
||||
if (VecSize <= 64 && EltSize < 32)
|
||||
return false;
|
||||
|
||||
// Always expand the rest of sub-dword instructions, otherwise it will be
|
||||
// lowered via memory.
|
||||
if (EltSize < 32)
|
||||
return true;
|
||||
|
||||
// Always do this if var-idx is divergent, otherwise it will become a loop.
|
||||
if (Idx->isDivergent())
|
||||
return true;
|
||||
|
||||
// Large vectors would yield too many compares and v_cndmask_b32 instructions.
|
||||
unsigned NumInsts = NumElem /* Number of compares */ +
|
||||
((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
|
||||
return NumInsts <= 16;
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::performExtractVectorEltCombine(
|
||||
SDNode *N, DAGCombinerInfo &DCI) const {
|
||||
SDValue Vec = N->getOperand(0);
|
||||
@ -9535,15 +9568,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
|
||||
unsigned EltSize = EltVT.getSizeInBits();
|
||||
|
||||
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
|
||||
// This eliminates non-constant index and subsequent movrel or scratch access.
|
||||
// Sub-dword vectors of size 2 dword or less have better implementation.
|
||||
// Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
|
||||
// instructions.
|
||||
// Always do this if var-idx is divergent, otherwise it will become a loop.
|
||||
if (!UseDivergentRegisterIndexing &&
|
||||
(VecSize <= 256 || N->getOperand(1)->isDivergent()) &&
|
||||
(VecSize > 64 || EltSize >= 32) &&
|
||||
!isa<ConstantSDNode>(N->getOperand(1))) {
|
||||
if (shouldExpandVectorDynExt(N)) {
|
||||
SDLoc SL(N);
|
||||
SDValue Idx = N->getOperand(1);
|
||||
SDValue V;
|
||||
@ -9603,19 +9628,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
|
||||
SDValue Idx = N->getOperand(2);
|
||||
EVT VecVT = Vec.getValueType();
|
||||
EVT EltVT = VecVT.getVectorElementType();
|
||||
unsigned VecSize = VecVT.getSizeInBits();
|
||||
unsigned EltSize = EltVT.getSizeInBits();
|
||||
|
||||
// INSERT_VECTOR_ELT (<n x e>, var-idx)
|
||||
// => BUILD_VECTOR n x select (e, const-idx)
|
||||
// This eliminates non-constant index and subsequent movrel or scratch access.
|
||||
// Sub-dword vectors of size 2 dword or less have better implementation.
|
||||
// Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
|
||||
// instructions.
|
||||
// Always do this if var-idx is divergent, otherwise it will become a loop.
|
||||
if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx) ||
|
||||
(VecSize > 256 && !Idx->isDivergent()) ||
|
||||
(VecSize <= 64 && EltSize < 32))
|
||||
if (!shouldExpandVectorDynExt(N))
|
||||
return SDValue();
|
||||
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
|
@ -48,6 +48,24 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks that a dynamic extractelement from a <5 x double> constant vector
; with a uniform index is expanded into v_cmp/v_cndmask pairs (one compare
; per element, one cndmask per result dword) rather than going through
; scratch memory (no buffer_ instructions).
; GCN-LABEL: {{^}}double5_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
; GCN: store_dwordx2 v[{{[0-9:]+}}]
define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
entry:
  %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
  store double %ext, double addrspace(1)* %out
  ret void
}
|
||||
|
||||
; GCN-LABEL: {{^}}half4_extelt:
|
||||
; GCN-NOT: buffer_
|
||||
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
|
||||
|
@ -277,6 +277,17 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Checks that a dynamic insertelement into a <5 x double> is expanded into
; cndmask selects (10 = 5 elements x 2 dwords each) instead of using movrel
; indexing or scratch memory.
; GCN-LABEL: {{^}}double5_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-COUNT-10: v_cndmask_b32
define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
  store <5 x double> %v, <5 x double> addrspace(1)* %out
  ret void
}
|
||||
|
||||
; GCN-LABEL: {{^}}double8_inselt:
|
||||
; GCN-NOT: v_cndmask
|
||||
; GCN-NOT: buffer_
|
||||
|
Loading…
x
Reference in New Issue
Block a user