
[SVE][CodeGen] Fix DAGCombiner::ForwardStoreValueToDirectLoad for scalable vectors

In DAGCombiner::ForwardStoreValueToDirectLoad I have fixed up some
implicit casts from TypeSize -> uint64_t and replaced calls to
getVectorNumElements() with getVectorElementCount(). There are some
simple cases of forwarding that we can definitely support for
scalable vectors, i.e. when the store and load are both scalable
vectors and have the same size. I have added tests for the new
code paths here:

  CodeGen/AArch64/sve-forward-st-to-ld.ll

Differential Revision: https://reviews.llvm.org/D87098
This commit is contained in:
David Sherwood 2020-09-03 08:28:57 +01:00
parent 88c849a4b2
commit 4bc4e2e524
2 changed files with 135 additions and 10 deletions
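As context for the diff below, here is a small self-contained C++ sketch of the coverage check the patch introduces. The types and names (ToySize, storeCoversLoad) are invented for illustration and are not LLVM's TypeSize/ElementCount API; the point is simply that a comparison between a fixed and a scalable size cannot be resolved at compile time, so the scalable case only forwards when the offset is zero and the minimum sizes match.

#include <cassert>
#include <cstdint>

// Toy stand-in for a vector size in bits: either a fixed constant, or
// MinBits * vscale where vscale is only known at run time.
struct ToySize {
  uint64_t MinBits;
  bool Scalable;
};

// Can a store of size St at byte 0 be proven to cover a load of size Ld at
// byte Offset for *every* possible vscale? This conservatively mirrors the
// STCoversLD logic added by the patch.
static bool storeCoversLoad(ToySize St, ToySize Ld, int64_t Offset) {
  if (St.Scalable != Ld.Scalable)
    return false; // mixed fixed/scalable: the answer depends on vscale
  if (St.Scalable)
    // The patch only forwards when the offset is zero and the minimum
    // (vscale = 1) sizes are identical, which is trivially safe.
    return Offset == 0 && St.MinBits == Ld.MinBits;
  // Fixed-width sizes: ordinary compile-time arithmetic still applies.
  return Offset >= 0 && Offset * 8 + Ld.MinBits <= St.MinBits;
}

int main() {
  assert(storeCoversLoad({128, true}, {128, true}, 0));   // nxv2i64 store, nxv2i64 load
  assert(!storeCoversLoad({128, true}, {128, false}, 0)); // scalable store, fixed load
  assert(storeCoversLoad({128, false}, {64, false}, 8));  // v2i64 store, i64 load of the tail
  return 0;
}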

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -14792,8 +14792,8 @@ SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
   return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
 }
 
-static inline int numVectorEltsOrZero(EVT T) {
-  return T.isVector() ? T.getVectorNumElements() : 0;
+static inline ElementCount numVectorEltsOrZero(EVT T) {
+  return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0);
 }
 
 bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
@@ -14861,6 +14861,24 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   EVT STMemType = ST->getMemoryVT();
   EVT STType = ST->getValue().getValueType();
 
+  // There are two cases to consider here:
+  //  1. The store is fixed width and the load is scalable. In this case we
+  //     don't know at compile time if the store completely envelops the load
+  //     so we abandon the optimisation.
+  //  2. The store is scalable and the load is fixed width. We could
+  //     potentially support a limited number of cases here, but there has been
+  //     no cost-benefit analysis to prove it's worth it.
+  bool LdStScalable = LDMemType.isScalableVector();
+  if (LdStScalable != STMemType.isScalableVector())
+    return SDValue();
+
+  // If we are dealing with scalable vectors on a big endian platform the
+  // calculation of offsets below becomes trickier, since we do not know at
+  // compile time the absolute size of the vector. Until we've done more
+  // analysis on big-endian platforms it seems better to bail out for now.
+  if (LdStScalable && DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
   BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
   BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
   int64_t Offset;
@@ -14872,13 +14890,21 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   // the stored value). With Offset=n (for n > 0) the loaded value starts at the
   // n:th least significant byte of the stored value.
   if (DAG.getDataLayout().isBigEndian())
-    Offset = ((int64_t)STMemType.getStoreSizeInBits() -
-              (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;
+    Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() -
+              (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) /
+                 8 -
+             Offset;
 
   // Check that the stored value cover all bits that are loaded.
-  bool STCoversLD =
-      (Offset >= 0) &&
-      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
+  bool STCoversLD;
+
+  TypeSize LdMemSize = LDMemType.getSizeInBits();
+  TypeSize StMemSize = STMemType.getSizeInBits();
+  if (LdStScalable)
+    STCoversLD = (Offset == 0) && LdMemSize == StMemSize;
+  else
+    STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <=
+                                   StMemSize.getFixedSize());
 
   auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
     if (LD->isIndexed()) {
@@ -14899,15 +14925,15 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   // Memory as copy space (potentially masked).
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
-    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
+    if (LDType.getSizeInBits() == LdMemSize)
       return ReplaceLd(LD, ST->getValue(), Chain);
     // Can we model the truncate and extension with an and mask?
     if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
         !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
       // Mask to size of LDMemType
       auto Mask =
-          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
-                                               STMemType.getSizeInBits()),
+          DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(),
+                                               StMemSize.getFixedSize()),
                           SDLoc(ST), STType);
       auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
       return ReplaceLd(LD, Val, Chain);
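The big-endian bail-out above exists because the offset rewrite needs the difference of the stored and loaded memory sizes as a compile-time constant. Below is a minimal illustrative sketch with plain integers (bigEndianOffset is an invented helper, not a DAGCombiner function) of that byte-offset flip for fixed-width types.

#include <cassert>
#include <cstdint>

// On a big-endian target the combiner re-expresses a memory byte offset as
// "offset from the least significant byte of the stored value", which needs
// the size difference between the stored and loaded values as a constant.
static int64_t bigEndianOffset(uint64_t StoreBits, uint64_t LoadBits,
                               int64_t Offset) {
  return (int64_t)(StoreBits - LoadBits) / 8 - Offset;
}

int main() {
  // Store an i64, then load the i32 at memory offset 0: on big-endian that is
  // the high half of the value, i.e. it starts at the 4th least significant byte.
  assert(bigEndianOffset(64, 32, 0) == 4);
  // Loading at memory offset 4 yields the low half of the value.
  assert(bigEndianOffset(64, 32, 4) == 0);
  return 0;
}

With scalable vectors both sizes would be multiples of an unknown vscale, so the rewritten offset has no fixed value, which is why the combine is abandoned for scalable vectors on big-endian targets.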

test/CodeGen/AArch64/sve-forward-st-to-ld.ll

@@ -0,0 +1,99 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
define <vscale x 2 x i64> @sti64ldi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
; CHECK-LABEL: sti64ldi64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
entry:
%arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
%arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
%0 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
ret <vscale x 2 x i64> %0
}
define <vscale x 2 x double> @stf64ldf64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
; CHECK-LABEL: stf64ldf64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
entry:
%arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
%arrayidx1 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
%0 = load <vscale x 2 x double>, <vscale x 2 x double>* %arrayidx1
ret <vscale x 2 x double> %0
}
define <vscale x 2 x i64> @sti32ldi32ext(<vscale x 2 x i32>* nocapture %P, <vscale x 2 x i64> %v) {
; CHECK-LABEL: sti32ldi32ext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z1.d, p0/m, z0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x0]
; CHECK-NEXT: mov z0.d, z1.d
; CHECK-NEXT: ret
entry:
%0 = trunc <vscale x 2 x i64> %v to <vscale x 2 x i32>
store <vscale x 2 x i32> %0, <vscale x 2 x i32>* %P
%1 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %P
%2 = sext <vscale x 2 x i32> %1 to <vscale x 2 x i64>
ret <vscale x 2 x i64> %2
}
define <2 x i64> @sti64ldfixedi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
; CHECK-LABEL: sti64ldfixedi64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ldr q0, [x0, x8]
; CHECK-NEXT: ret
entry:
%arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
%arrayidx1 = bitcast <vscale x 2 x i64>* %arrayidx0 to <2 x i64>*
%0 = load <2 x i64>, <2 x i64>* %arrayidx1
ret <2 x i64> %0
}
define <vscale x 4 x i32> @sti64ldi32(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
; CHECK-LABEL: sti64ldi32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
entry:
%0 = bitcast <vscale x 2 x i64>* %P to <vscale x 4 x i32>*
%arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
%arrayidx1 = getelementptr inbounds <vscale x 4 x i32>, <vscale x 4 x i32>* %0, i64 1
%1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %arrayidx1
ret <vscale x 4 x i32> %1
}
define <vscale x 2 x i64> @stf64ldi64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
; CHECK-LABEL: stf64ldi64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
entry:
%0 = bitcast <vscale x 2 x double>* %P to <vscale x 2 x i64>*
%arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
%arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %0, i64 1
%1 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
ret <vscale x 2 x i64> %1
}