[SVE][CodeGen] Fix scalable vector issues in DAGTypeLegalizer::GenWidenVectorLoads

In DAGTypeLegalizer::GenWidenVectorLoads the algorithm assumes it only ever deals with fixed-width types, hence the offsets for each individual load never take 'vscale' into account. I've changed the code in that function to use TypeSize instead of unsigned for tracking the remaining load amount. In addition, I've changed the load loop to use the new IncrementPointer helper function for updating the addresses in each iteration, since this handles scalable vector types. Also, I've added calls to report_fatal_error in GenWidenVectorExtLoads, TargetLowering::scalarizeVectorLoad and TargetLowering::scalarizeVectorStore, since these functions currently use a sequence of element-by-element scalar loads/stores. In a similar vein, I've also added a fatal error report in FindMemType for the case when we decide to return the element type for a scalable vector type. I've added new tests in CodeGen/AArch64/sve-split-load.ll and CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll for the changes in GenWidenVectorLoads. Differential Revision: https://reviews.llvm.org/D85909
2024-10-19 11:02:59 +02:00 · 2020-08-12 14:16:22 +01:00 · 2020-08-12 14:16:22 +01:00 · f7a1832d69
commit f7a1832d69
parent 540752542a
5 changed files with 116 additions and 33 deletions
--- a/include/llvm/Support/TypeSize.h
+++ b/include/llvm/Support/TypeSize.h
@ -145,6 +145,24 @@ public:
    return *this;
  }

+  friend TypeSize operator-(const TypeSize &LHS, const TypeSize &RHS) {
+    assert(LHS.IsScalable == RHS.IsScalable &&
+           "Arithmetic using mixed scalable and fixed types");
+    return {LHS.MinSize - RHS.MinSize, LHS.IsScalable};
+  }
+
+  friend TypeSize operator/(const TypeSize &LHS, const TypeSize &RHS) {
+    assert(LHS.IsScalable == RHS.IsScalable &&
+           "Arithmetic using mixed scalable and fixed types");
+    return {LHS.MinSize / RHS.MinSize, LHS.IsScalable};
+  }
+
+  friend TypeSize operator%(const TypeSize &LHS, const TypeSize &RHS) {
+    assert(LHS.IsScalable == RHS.IsScalable &&
+           "Arithmetic using mixed scalable and fixed types");
+    return {LHS.MinSize % RHS.MinSize, LHS.IsScalable};
+  }
+
  // Return the minimum size with the assumption that the size is exact.
  // Use in places where a scalable size doesn't make sense (e.g. non-vector
  // types, or vectors in backends which don't support scalable vectors).
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@ -4895,11 +4895,14 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
        isPowerOf2_32(WidenWidth / MemVTWidth) &&
        (MemVTWidth <= Width ||
         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
-      if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
+      if (RetVT.getSizeInBits().getFixedSize() < MemVTWidth || MemVT == WidenVT)
        return MemVT;
    }
  }

+  if (Scalable)
+    report_fatal_error("Using element-wise loads and stores for widening "
+                       "operations is not supported for scalable vectors");
  return RetVT;
 }

@ -4942,10 +4945,10 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
  // element type or scalar loads and then recombines it to the widen vector
  // type.
  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
-  unsigned WidenWidth = WidenVT.getSizeInBits();
  EVT LdVT    = LD->getMemoryVT();
  SDLoc dl(LD);
  assert(LdVT.isVector() && WidenVT.isVector());
+  assert(LdVT.isScalableVector() == WidenVT.isScalableVector());
  assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());

  // Load information
@ -4954,15 +4957,17 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  AAMDNodes AAInfo = LD->getAAInfo();

-  int LdWidth = LdVT.getSizeInBits();
-  int WidthDiff = WidenWidth - LdWidth;
+  TypeSize LdWidth = LdVT.getSizeInBits();
+  TypeSize WidenWidth = WidenVT.getSizeInBits();
+  TypeSize WidthDiff = WidenWidth - LdWidth;
  // Allow wider loads if they are sufficiently aligned to avoid memory faults
  // and if the original load is simple.
  unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();

  // Find the vector type that can load from.
-  EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
-  int NewVTWidth = NewVT.getSizeInBits();
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
+                          WidthDiff.getKnownMinSize());
+  TypeSize NewVTWidth = NewVT.getSizeInBits();
  SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
                             LD->getOriginalAlign(), MMOFlags, AAInfo);
  LdChain.push_back(LdOp.getValue(1));
@ -4970,7 +4975,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
  // Check if we can load the element with one instruction.
  if (LdWidth <= NewVTWidth) {
    if (!NewVT.isVector()) {
-      unsigned NumElts = WidenWidth / NewVTWidth;
+      unsigned NumElts = WidenWidth.getFixedSize() / NewVTWidth.getFixedSize();
      EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
      SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
      return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
@ -4978,8 +4983,9 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
    if (NewVT == WidenVT)
      return LdOp;

-    assert(WidenWidth % NewVTWidth == 0);
-    unsigned NumConcat = WidenWidth / NewVTWidth;
+    // TODO: We don't currently have any tests that exercise this code path.
+    assert(WidenWidth.getFixedSize() % NewVTWidth.getFixedSize() == 0);
+    unsigned NumConcat = WidenWidth.getFixedSize() / NewVTWidth.getFixedSize();
    SmallVector<SDValue, 16> ConcatOps(NumConcat);
    SDValue UndefVal = DAG.getUNDEF(NewVT);
    ConcatOps[0] = LdOp;
@ -4992,35 +4998,30 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
  SmallVector<SDValue, 16> LdOps;
  LdOps.push_back(LdOp);

-  LdWidth -= NewVTWidth;
-  unsigned Offset = 0;
+  uint64_t ScaledOffset = 0;
+  MachinePointerInfo MPI = LD->getPointerInfo();
+  do {
+    LdWidth -= NewVTWidth;
+    IncrementPointer(cast<LoadSDNode>(LdOp), NewVT, MPI, BasePtr,
+                     &ScaledOffset);

-  while (LdWidth > 0) {
-    unsigned Increment = NewVTWidth / 8;
-    Offset += Increment;
-    BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, TypeSize::Fixed(Increment));
-
-    SDValue L;
    if (LdWidth < NewVTWidth) {
      // The current type we are using is too large. Find a better size.
-      NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+      NewVT = FindMemType(DAG, TLI, LdWidth.getKnownMinSize(), WidenVT, LdAlign,
+                          WidthDiff.getKnownMinSize());
      NewVTWidth = NewVT.getSizeInBits();
-      L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
-                      LD->getPointerInfo().getWithOffset(Offset),
-                      LD->getOriginalAlign(), MMOFlags, AAInfo);
-      LdChain.push_back(L.getValue(1));
-    } else {
-      L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
-                      LD->getPointerInfo().getWithOffset(Offset),
-                      LD->getOriginalAlign(), MMOFlags, AAInfo);
-      LdChain.push_back(L.getValue(1));
    }

+    Align NewAlign = ScaledOffset == 0
+                         ? LD->getOriginalAlign()
+                         : commonAlignment(LD->getAlign(), ScaledOffset);
+    SDValue L =
+        DAG.getLoad(NewVT, dl, Chain, BasePtr, MPI, NewAlign, MMOFlags, AAInfo);
+    LdChain.push_back(L.getValue(1));
+
    LdOps.push_back(L);
    LdOp = L;
-
-    LdWidth -= NewVTWidth;
-  }
+  } while (LdWidth > NewVTWidth);

  // Build the vector from the load operations.
  unsigned End = LdOps.size();
@ -5044,13 +5045,17 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
    }
    ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i + 1, End);
  }
+
  ConcatOps[--Idx] = LdOps[i];
  for (--i; i >= 0; --i) {
    EVT NewLdTy = LdOps[i].getValueType();
    if (NewLdTy != LdTy) {
      // Create a larger vector.
-      unsigned NumOps = NewLdTy.getSizeInBits() / LdTy.getSizeInBits();
-      assert(NewLdTy.getSizeInBits() % LdTy.getSizeInBits() == 0);
+      unsigned NumOps =
+          (NewLdTy.getSizeInBits() / LdTy.getSizeInBits()).getKnownMinSize();
+      assert(
+          (NewLdTy.getSizeInBits() % LdTy.getSizeInBits()).getKnownMinSize() ==
+          0);
      SmallVector<SDValue, 16> WidenOps(NumOps);
      unsigned j = 0;
      for (; j != End-Idx; ++j)
@ -5071,7 +5076,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
                       makeArrayRef(&ConcatOps[Idx], End - Idx));

  // We need to fill the rest with undefs to build the vector.
-  unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
+  unsigned NumOps = (WidenWidth / LdTy.getSizeInBits()).getKnownMinSize();
  SmallVector<SDValue, 16> WidenOps(NumOps);
  SDValue UndefVal = DAG.getUNDEF(LdTy);
  {
@ -5094,6 +5099,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
  EVT LdVT    = LD->getMemoryVT();
  SDLoc dl(LD);
  assert(LdVT.isVector() && WidenVT.isVector());
+  assert(LdVT.isScalableVector() == WidenVT.isScalableVector());

  // Load information
  SDValue Chain = LD->getChain();
@ -5101,6 +5107,10 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
  AAMDNodes AAInfo = LD->getAAInfo();

+  if (LdVT.isScalableVector())
+    report_fatal_error("Generating widen scalable extending vector loads is "
+                       "not yet supported");
+
  EVT EltVT = WidenVT.getVectorElementType();
  EVT LdEltVT = LdVT.getVectorElementType();
  unsigned NumElts = LdVT.getVectorNumElements();
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -6724,6 +6724,9 @@ TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
  EVT DstVT = LD->getValueType(0);
  ISD::LoadExtType ExtType = LD->getExtensionType();

+  if (SrcVT.isScalableVector())
+    report_fatal_error("Cannot scalarize scalable vector loads");
+
  unsigned NumElem = SrcVT.getVectorNumElements();

  EVT SrcEltVT = SrcVT.getScalarType();
@ -6811,6 +6814,9 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
  SDValue Value = ST->getValue();
  EVT StVT = ST->getMemoryVT();

+  if (StVT.isScalableVector())
+    report_fatal_error("Cannot scalarize scalable vector stores");
+
  // The type of the data we want to save
  EVT RegVT = Value.getValueType();
  EVT RegSclVT = RegVT.getScalarType();
--- a/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
+++ b/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
@ -104,3 +104,40 @@ define <vscale x 2 x i64> @ld1d_inbound(<vscale x 2 x i64>* %a) {
  %load = load <vscale x 2 x i64>, <vscale x 2 x i64>* %base
  ret <vscale x 2 x i64> %load
 }
+
+define void @load_nxv6f16(<vscale x 6 x half>* %a) {
+; CHECK-LABEL: load_nxv6f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load volatile <vscale x 6 x half>, <vscale x 6 x half>* %a
+  ret void
+}
+
+define void @load_nxv6f32(<vscale x 6 x float>* %a) {
+; CHECK-LABEL: load_nxv6f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load volatile <vscale x 6 x float>, <vscale x 6 x float>* %a
+  ret void
+}
+
+define void @load_nxv12f16(<vscale x 12 x half>* %a) {
+; CHECK-LABEL: load_nxv12f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load volatile <vscale x 12 x half>, <vscale x 12 x half>* %a
+  ret void
+}
+
--- a/test/CodeGen/AArch64/sve-split-load.ll
+++ b/test/CodeGen/AArch64/sve-split-load.ll
@ -24,6 +24,18 @@ define <vscale x 16 x i16> @load_split_i16(<vscale x 16 x i16>* %a) {
  ret <vscale x 16 x i16> %load
 }

+define <vscale x 24 x i16> @load_split_24i16(<vscale x 24 x i16>* %a) {
+; CHECK-LABEL: load_split_24i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 24 x i16>, <vscale x 24 x i16>* %a
+  ret <vscale x 24 x i16> %load
+}
+
 define <vscale x 32 x i16> @load_split_32i16(<vscale x 32 x i16>* %a) {
 ; CHECK-LABEL: load_split_32i16:
 ; CHECK:       // %bb.0: