Mirror of https://github.com/RPCS3/llvm-mirror.git, synced 2024-11-23 03:02:36 +01:00
[CodeGen] Fix issues with scalable-vector INSERT/EXTRACT_SUBVECTORs
This patch addresses a few issues when dealing with scalable-vector INSERT_SUBVECTOR and EXTRACT_SUBVECTOR nodes.

When legalizing in DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR, we store the low and high halves to the stack separately. The offset for the high half was calculated incorrectly.

Additionally, we can optimize this process when we can detect that the subvector is contained entirely within the low/high split vector type. While this optimization is valid on scalable vectors, when performing the 'high' optimization the subvector must also be a scalable vector. Note that the 'low' optimization is still conservative: it may be possible to insert a v2i32 into the low half of a split nxv1i32/nxv1i32, but we can't guarantee it. It is always possible to insert a v2i32 into nxv2i32, or a v2i32 into nxv4i32 at index 2, as we know vscale is at least 1.

Lastly, in SelectionDAG::isSplatValue, we early-exit on the extracted subvector value type being a scalable vector, forgetting that we can also extract a fixed-length vector from a scalable one.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D98495
This commit is contained in:
parent 4f01a02f9f
commit 33c1eeca58
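
Before the diff itself, a minimal standalone sketch of the containment reasoning described in the commit message. The names VecInfo, SplitTarget and classifyInsert are hypothetical; the real legalizer works on EVTs and SDValues, not on this struct.

// Illustrative model only; the real code operates on EVTs and SDValues.
// Element counts below are *minimum* counts, i.e. the counts at vscale == 1.
struct VecInfo {
  unsigned MinElems; // minimum element count of the type
  bool Scalable;     // true for <vscale x N x Ty> types
};

enum class SplitTarget { LoHalf, HiHalf, SpillToStack };

SplitTarget classifyInsert(VecInfo Vec, VecInfo Sub, unsigned IdxVal,
                           unsigned LoElems) {
  unsigned SubElems = Sub.MinElems;
  unsigned VecElems = Vec.MinElems;
  // Low half: the subvector starts at a fixed index from element 0, and
  // growing vscale only grows the low half, so containment holds for every
  // vscale >= 1 (no scalability guard needed).
  if (IdxVal + SubElems <= LoElems)
    return SplitTarget::LoHalf;
  // High half: only provable when the subvector scales together with the
  // containing vector. With a fixed-length subvector in a scalable container
  // we cannot tell whether it ends up in the low half, the high half, or
  // straddling the split point.
  if (Vec.Scalable == Sub.Scalable && IdxVal >= LoElems &&
      IdxVal + SubElems <= VecElems)
    return SplitTarget::HiHalf;
  return SplitTarget::SpillToStack;
}

The low-half test needs no scalability guard because growing vscale only grows the low half; the high-half test does, which corresponds to the VecVT.isScalableVector() == SubVecVT.isScalableVector() condition added in the first hunk below.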
@@ -1273,20 +1273,24 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
 
   EVT VecVT = Vec.getValueType();
   EVT LoVT = Lo.getValueType();
-  unsigned VecElems = VecVT.getVectorNumElements();
-  unsigned SubElems = SubVec.getValueType().getVectorNumElements();
-  unsigned LoElems = LoVT.getVectorNumElements();
+  EVT SubVecVT = SubVec.getValueType();
+  unsigned VecElems = VecVT.getVectorMinNumElements();
+  unsigned SubElems = SubVecVT.getVectorMinNumElements();
+  unsigned LoElems = LoVT.getVectorMinNumElements();
 
   // If we know the index is in the first half, and we know the subvector
   // doesn't cross the boundary between the halves, we can avoid spilling the
   // vector, and insert into the lower half of the split vector directly.
-  // Similarly if the subvector is fully in the high half.
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   if (IdxVal + SubElems <= LoElems) {
     Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
     return;
   }
-  if (IdxVal >= LoElems && IdxVal + SubElems <= VecElems) {
+  // Similarly if the subvector is fully in the high half, but mind that we
+  // can't tell whether a fixed-length subvector is fully within the high half
+  // of a scalable vector.
+  if (VecVT.isScalableVector() == SubVecVT.isScalableVector() &&
+      IdxVal >= LoElems && IdxVal + SubElems <= VecElems) {
     Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, Hi.getValueType(), Hi, SubVec,
                      DAG.getVectorIdxConstant(IdxVal - LoElems, dl));
     return;
@@ -1315,13 +1319,12 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
                        SmallestAlign);
 
   // Increment the pointer to the other part.
-  unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
-  StackPtr =
-      DAG.getMemBasePlusOffset(StackPtr, TypeSize::Fixed(IncrementSize), dl);
+  auto *Load = cast<LoadSDNode>(Lo);
+  MachinePointerInfo MPI = Load->getPointerInfo();
+  IncrementPointer(Load, LoVT, MPI, StackPtr);
 
   // Load the Hi part from the stack slot.
-  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
-                   PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
+  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MPI, SmallestAlign);
 }
 
 void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
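The hunk above is the offset fix: the old code computed the high half's position as a fixed byte count, Lo.getValueSizeInBits() / 8, but when Lo is scalable that is only the minimum size; the real offset scales with vscale and is only known at run time, hence the switch to IncrementPointer and the load's MachinePointerInfo. A rough standalone sketch of the distinction follows (hypothetical helper name, not an LLVM API).

#include <cstdint>

// Byte offset at which the spilled high half starts, given the low half's
// minimum store size in bytes. Illustrative only: the legalizer expresses this
// as a vscale-based address computation instead of evaluating vscale.
uint64_t highHalfOffsetBytes(uint64_t LoMinBytes, bool LoIsScalable,
                             uint64_t VScale /* runtime value, >= 1 */) {
  // A fixed offset of LoMinBytes is only correct for a fixed-length low half
  // (or when VScale happens to be 1), which is the bug the hunk above fixes.
  return LoIsScalable ? LoMinBytes * VScale : LoMinBytes;
}

In the RISC-V tests further down this shows up as offsets built from vlenb at run time (csrr/slli/add sequences) rather than a constant immediate.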
@@ -2524,6 +2524,9 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
   case ISD::EXTRACT_SUBVECTOR: {
     // Offset the demanded elts by the subvector index.
     SDValue Src = V.getOperand(0);
+    // We don't support scalable vectors at the moment.
+    if (Src.getValueType().isScalableVector())
+      return false;
     uint64_t Idx = V.getConstantOperandVal(1);
     unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
     APInt UndefSrcElts;
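For the isSplatValue change above: the code after the new early exit calls getVectorNumElements() on the source of the EXTRACT_SUBVECTOR, which is invalid for a scalable source even when the extracted type itself is fixed-length. A small standalone model of that pitfall (hypothetical VT struct and helper, not LLVM's EVT API):

#include <cassert>
#include <optional>

// Hypothetical stand-in for a vector type: asking for a fixed element count is
// only valid when the type is not scalable.
struct VT {
  unsigned MinNumElems;
  bool Scalable;
  unsigned getVectorNumElements() const {
    assert(!Scalable && "fixed-length query on a scalable type");
    return MinNumElems;
  }
};

// The demanded-elements bookkeeping must bail out on a scalable *source*, even
// when the extracted subvector type is fixed-length; otherwise the fixed
// element-count query above is reached with a scalable type.
std::optional<unsigned> splatAnalysisSrcElts(VT ExtractedTy, VT SrcTy) {
  (void)ExtractedTy; // a fixed-length result alone does not make this safe
  if (SrcTy.Scalable)
    return std::nullopt; // mirrors the early "return false" added above
  return SrcTy.getVectorNumElements();
}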
@@ -553,6 +553,99 @@ define <vscale x 8 x i1> @insert_nxv8i1_v8i1_16(<vscale x 8 x i1> %v, <8 x i1>*
   ret <vscale x 8 x i1> %c
 }
 
+declare <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
+
+define void @insert_v2i64_nxv16i64(<2 x i64>* %psv0, <2 x i64>* %psv1, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_v2i64_nxv16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a3, 2, e64,m1,ta,mu
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: vsetivli a0, 6, e64,m8,tu,mu
+; CHECK-NEXT: vslideup.vi v8, v16, 4
+; CHECK-NEXT: vs8r.v v8, (a2)
+; CHECK-NEXT: ret
+  %sv0 = load <2 x i64>, <2 x i64>* %psv0
+  %sv1 = load <2 x i64>, <2 x i64>* %psv1
+  %v0 = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> %v0, <2 x i64> %sv1, i64 4)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+define void @insert_v2i64_nxv16i64_lo0(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_v2i64_nxv16i64_lo0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a2, 2, e64,m1,ta,mu
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vs8r.v v8, (a1)
+; CHECK-NEXT: ret
+  %sv = load <2 x i64>, <2 x i64>* %psv
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+define void @insert_v2i64_nxv16i64_lo2(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_v2i64_nxv16i64_lo2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli a2, 2, e64,m1,ta,mu
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vsetivli a0, 4, e64,m8,ta,mu
+; CHECK-NEXT: vslideup.vi v16, v8, 2
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: ret
+  %sv = load <2 x i64>, <2 x i64>* %psv
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+; Check we don't mistakenly optimize this: we don't know whether this is
+; inserted into the low or high split vector.
+define void @insert_v2i64_nxv16i64_hi(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_v2i64_nxv16i64_hi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: sub sp, sp, a2
+; CHECK-NEXT: vsetivli a2, 2, e64,m1,ta,mu
+; CHECK-NEXT: vle64.v v25, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: slli a2, a0, 4
+; CHECK-NEXT: addi a2, a2, -1
+; CHECK-NEXT: addi a3, zero, 8
+; CHECK-NEXT: bltu a2, a3, .LBB29_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: addi a2, zero, 8
+; CHECK-NEXT: .LBB29_2:
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vsetivli a4, 2, e64,m1,ta,mu
+; CHECK-NEXT: vse64.v v25, (a2)
+; CHECK-NEXT: slli a0, a0, 6
+; CHECK-NEXT: add a2, a3, a0
+; CHECK-NEXT: vl8re64.v v8, (a2)
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vl8re64.v v16, (a2)
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+  %sv = load <2 x i64>, <2 x i64>* %psv
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 8)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
 declare <8 x i1> @llvm.experimental.vector.insert.v4i1.v8i1(<8 x i1>, <4 x i1>, i64)
 declare <32 x i1> @llvm.experimental.vector.insert.v8i1.v32i1(<32 x i1>, <8 x i1>, i64)
 
@@ -458,6 +458,46 @@ define <vscale x 4 x i1> @insert_nxv4i1_nxv1i1_2(<vscale x 4 x i1> %v, <vscale x
   ret <vscale x 4 x i1> %vec
 }
 
+declare <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
+
+define void @insert_nxv8i64_nxv16i64(<vscale x 8 x i64> %sv0, <vscale x 8 x i64> %sv1, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_nxv8i64_nxv16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vs8r.v v16, (a0)
+; CHECK-NEXT: ret
+  %v0 = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> %v0, <vscale x 8 x i64> %sv1, i64 8)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+define void @insert_nxv8i64_nxv16i64_lo(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_nxv8i64_nxv16i64_lo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: ret
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
+define void @insert_nxv8i64_nxv16i64_hi(<vscale x 8 x i64> %sv0, <vscale x 16 x i64>* %out) {
+; CHECK-LABEL: insert_nxv8i64_nxv16i64_hi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: ret
+  %v = call <vscale x 16 x i64> @llvm.experimental.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 8)
+  store <vscale x 16 x i64> %v, <vscale x 16 x i64>* %out
+  ret void
+}
+
 declare <vscale x 4 x i1> @llvm.experimental.vector.insert.nxv1i1.nxv4i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64)
 declare <vscale x 32 x i1> @llvm.experimental.vector.insert.nxv8i1.nxv32i1(<vscale x 32 x i1>, <vscale x 8 x i1>, i64)
 
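Tying the tests back to the sketch after the commit message, and reusing its hypothetical classifyInsert/SplitTarget names: the fixed-length insert at index 8 (insert_v2i64_nxv16i64_hi above) has to go via the stack, while the scalable insert_nxv8i64_nxv16i64_hi resolves directly to the high half.

#include <cassert>

int main() {
  // <2 x i64> into <vscale x 16 x i64> at index 8 (insert_v2i64_nxv16i64_hi):
  // fixed into scalable, so high-half containment cannot be proven and the
  // legalizer must spill to the stack.
  assert(classifyInsert({/*MinElems=*/16, /*Scalable=*/true}, {2, false},
                        /*IdxVal=*/8, /*LoElems=*/8) ==
         SplitTarget::SpillToStack);

  // <vscale x 8 x i64> into <vscale x 16 x i64> at index 8
  // (insert_nxv8i64_nxv16i64_hi): both scalable, so the subvector provably
  // occupies the high half and no stack traffic is needed.
  assert(classifyInsert({16, true}, {8, true}, /*IdxVal=*/8, /*LoElems=*/8) ==
         SplitTarget::HiHalf);
  return 0;
}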