1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 12:41:49 +01:00

[AMDGPU] Tune threshold for cmp/select vector lowering

It was set in terms of total vector size, while the idea was to limit
the number of instructions. Now that it has started to work with doubles,
the thresholds need to be updated.

Differential Revision: https://reviews.llvm.org/D80322
This commit is contained in:
Stanislav Mekhanoshin 2020-05-20 11:51:07 -07:00
parent bd64a1c8db
commit 6d06d957a7
3 changed files with 64 additions and 19 deletions

View File

@ -9474,6 +9474,39 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
return SDValue();
}
// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
// expanded into a set of cmp/select instructions.
static bool shouldExpandVectorDynExt(SDNode *N) {
  // The variable index is always the last operand for both EXTRACT and INSERT.
  SDValue Index = N->getOperand(N->getNumOperands() - 1);

  // A constant index lowers directly; the divergent-register-indexing option
  // is a user override that disables this expansion altogether.
  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Index))
    return false;

  SDValue Vector = N->getOperand(0);
  EVT VectorVT = Vector.getValueType();
  EVT ElementVT = VectorVT.getVectorElementType();
  unsigned VectorBits = VectorVT.getSizeInBits();
  unsigned ElementBits = ElementVT.getSizeInBits();
  unsigned ElementCount = VectorVT.getVectorNumElements();

  // Sub-dword vectors of size 2 dword or less have better implementation.
  if (VectorBits <= 64 && ElementBits < 32)
    return false;

  // Always expand the rest of sub-dword instructions, otherwise it will be
  // lowered via memory.
  if (ElementBits < 32)
    return true;

  // Always do this if var-idx is divergent, otherwise it will become a loop.
  if (Index->isDivergent())
    return true;

  // Large vectors would yield too many compares and v_cndmask_b32
  // instructions: one compare per element, plus one cndmask per 32-bit
  // chunk of each element.
  unsigned NumCompares = ElementCount;
  unsigned NumCndMasks = ((ElementBits + 31) / 32) * ElementCount;
  return NumCompares + NumCndMasks <= 16;
}
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
@ -9535,15 +9568,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
unsigned EltSize = EltVT.getSizeInBits();
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
// This eliminates non-constant index and subsequent movrel or scratch access.
// Sub-dword vectors of size 2 dword or less have better implementation.
// Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
// instructions.
// Always do this if var-idx is divergent, otherwise it will become a loop.
if (!UseDivergentRegisterIndexing &&
(VecSize <= 256 || N->getOperand(1)->isDivergent()) &&
(VecSize > 64 || EltSize >= 32) &&
!isa<ConstantSDNode>(N->getOperand(1))) {
if (shouldExpandVectorDynExt(N)) {
SDLoc SL(N);
SDValue Idx = N->getOperand(1);
SDValue V;
@ -9603,19 +9628,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
SDValue Idx = N->getOperand(2);
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
// INSERT_VECTOR_ELT (<n x e>, var-idx)
// => BUILD_VECTOR n x select (e, const-idx)
// This eliminates non-constant index and subsequent movrel or scratch access.
// Sub-dword vectors of size 2 dword or less have better implementation.
// Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
// instructions.
// Always do this if var-idx is divergent, otherwise it will become a loop.
if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx) ||
(VecSize > 256 && !Idx->isDivergent()) ||
(VecSize <= 64 && EltSize < 32))
if (!shouldExpandVectorDynExt(N))
return SDValue();
SelectionDAG &DAG = DCI.DAG;

View File

@ -48,6 +48,24 @@ entry:
ret void
}
; GCN-LABEL: {{^}}double5_extelt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
; GCN: store_dwordx2 v[{{[0-9:]+}}]
; Extract a double from a constant <5 x double> with a non-constant index
; (%sel is a scalar kernel argument). The GCN checks above expect the
; cmp/cndmask select expansion rather than a buffer (scratch) access.
define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
entry:
%ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
store double %ext, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}half4_extelt:
; GCN-NOT: buffer_
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00

View File

@ -277,6 +277,17 @@ entry:
ret void
}
; GCN-LABEL: {{^}}double5_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-COUNT-10: v_cndmask_b32
; Insert 1.0 into a <5 x double> at a non-constant index. The GCN checks
; above expect the cmp/select expansion (v_cndmask_b32) instead of movrel
; or a buffer (scratch) access.
define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
entry:
%v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
store <5 x double> %v, <5 x double> addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}double8_inselt:
; GCN-NOT: v_cndmask
; GCN-NOT: buffer_