1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 20:23:11 +01:00

[AArch64] Gang up loads and stores for pairing.

Keep loads and stores together (target defines how many loads
and stores to gang up), such that it will help in pairing
and vectorization.

Differential Revision: https://reviews.llvm.org/D46477

llvm-svn: 332482
This commit is contained in:
Sirish Pande 2018-05-16 15:36:52 +00:00
parent 57f6c9317c
commit f2ba0203b0
10 changed files with 122 additions and 27 deletions

View File

@ -1199,6 +1199,15 @@ public:
return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
}
/// \brief Get maximum # of store operations to be glued together
///
/// This function returns the maximum number of store operations permitted
/// to glue together during lowering of llvm.memcpy. The value is set by
/// the target at the performance threshold for such a replacement.
virtual unsigned getMaxGluedStoresPerMemcpy() const {
return MaxGluedStoresPerMemcpy;
}
/// Get maximum # of load operations permitted for memcmp
///
/// This function returns the maximum number of load operations permitted
@ -2509,6 +2518,14 @@ protected:
/// constant size.
unsigned MaxStoresPerMemcpy;
/// \brief Specify max number of store instructions to glue in inlined memcpy.
///
/// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number
/// of store instructions to keep together. This helps in pairing and
// vectorization later on.
unsigned MaxGluedStoresPerMemcpy = 0;
/// Maximum number of store operations that may be substituted for a call to
/// memcpy, used for functions with OptSize attribute.
unsigned MaxStoresPerMemcpyOptSize;

View File

@ -89,6 +89,14 @@ void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
#define DEBUG_TYPE "selectiondag"
static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt",
cl::Hidden, cl::init(true),
cl::desc("Gang up loads and stores generated by inlining of memcpy"));
static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
cl::desc("Number limit for gluing ld/st of memcpy."),
cl::Hidden, cl::init(0));
static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
}
@ -5218,6 +5226,31 @@ static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
return MF.getFunction().optForSize();
}
/// Gang up the loads and stores in the index range [From, To) of
/// OutLoadChains/OutStoreChains so later passes can pair or vectorize them.
///
/// All load chains in the range are tied together with a single TokenFactor,
/// and each store is re-emitted with that token as its chain, so every load
/// in the group is ordered before every store. Both the original load chains
/// and the rewritten stores are appended to OutChains.
static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
                                         SmallVector<SDValue, 32> &OutChains,
                                         unsigned From, unsigned To,
                                         SmallVector<SDValue, 16> &OutLoadChains,
                                         SmallVector<SDValue, 16> &OutStoreChains) {
  assert(OutLoadChains.size() && "Missing loads in memcpy inlining");
  assert(OutStoreChains.size() && "Missing stores in memcpy inlining");

  // Collect the load chains for this group; they are also forwarded
  // unchanged into OutChains.
  SmallVector<SDValue, 16> GluedLoadChains;
  for (unsigned i = From; i < To; ++i) {
    OutChains.push_back(OutLoadChains[i]);
    GluedLoadChains.push_back(OutLoadChains[i]);
  }

  // Single token that makes all loads in the group available.
  SDValue LoadToken =
      DAG.getNode(ISD::TokenFactor, dl, MVT::Other, GluedLoadChains);

  for (unsigned i = From; i < To; ++i) {
    // Every entry in OutStoreChains was produced by DAG.getTruncStore, so it
    // is guaranteed to be a StoreSDNode: use the unchecked cast<> rather than
    // dyn_cast<>, whose nullptr result was dereferenced without a check.
    StoreSDNode *ST = cast<StoreSDNode>(OutStoreChains[i]);
    SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(),
                                         ST->getBasePtr(), ST->getMemoryVT(),
                                         ST->getMemOperand());
    OutChains.push_back(NewStore);
  }
}
static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
uint64_t Size, unsigned Align,
@ -5282,7 +5315,9 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
MachineMemOperand::Flags MMOFlags =
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
SmallVector<SDValue, 8> OutChains;
SmallVector<SDValue, 16> OutLoadChains;
SmallVector<SDValue, 16> OutStoreChains;
SmallVector<SDValue, 32> OutChains;
unsigned NumMemOps = MemOps.size();
uint64_t SrcOff = 0, DstOff = 0;
for (unsigned i = 0; i != NumMemOps; ++i) {
@ -5316,11 +5351,13 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SubSlice.Length = VTSize;
}
Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
if (Value.getNode())
if (Value.getNode()) {
Store = DAG.getStore(Chain, dl, Value,
DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), Align,
MMOFlags);
OutChains.push_back(Store);
}
}
if (!Store.getNode()) {
@ -5342,17 +5379,61 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
DAG.getMemBasePlusOffset(Src, SrcOff, dl),
SrcPtrInfo.getWithOffset(SrcOff), VT,
MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
OutChains.push_back(Value.getValue(1));
OutLoadChains.push_back(Value.getValue(1));
Store = DAG.getTruncStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
OutStoreChains.push_back(Store);
}
OutChains.push_back(Store);
SrcOff += VTSize;
DstOff += VTSize;
Size -= VTSize;
}
unsigned GluedLdStLimit = MaxLdStGlue == 0 ?
TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue;
unsigned NumLdStInMemcpy = OutStoreChains.size();
if (NumLdStInMemcpy) {
// It may be that memcpy might be converted to memset if it's memcpy
// of constants. In such a case, we won't have loads and stores, but
// just stores. In the absence of loads, there is nothing to gang up.
if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) {
// If target does not care, just leave as it.
for (unsigned i = 0; i < NumLdStInMemcpy; ++i) {
OutChains.push_back(OutLoadChains[i]);
OutChains.push_back(OutStoreChains[i]);
}
} else {
// Ld/St less than/equal limit set by target.
if (NumLdStInMemcpy <= GluedLdStLimit) {
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
NumLdStInMemcpy, OutLoadChains,
OutStoreChains);
} else {
unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit;
unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
unsigned GlueIter = 0;
for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
unsigned IndexTo = NumLdStInMemcpy - GlueIter;
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
OutLoadChains, OutStoreChains);
GlueIter += GluedLdStLimit;
}
// Residual ld/st.
if (RemainingLdStInMemcpy) {
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
RemainingLdStInMemcpy, OutLoadChains,
OutStoreChains);
}
}
}
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}

View File

@ -535,6 +535,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
// Perform these initializations only once.
MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
MaxLoadsPerMemcmp = 8;
MaxGluedStoresPerMemcpy = 0;
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
UseUnderscoreSetJmp = false;

View File

@ -580,6 +580,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::GlobalAddress);
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxGluedStoresPerMemcpy = 4;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;

View File

@ -8,9 +8,9 @@
; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}}
; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}}
; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]]
; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8]
; CHECK-NEXT: str [[VAL]], [x0, #8]
; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]]
; CHECK-NEXT: str [[VAL2]], [x0]
define void @foo(i8* %a) {

View File

@ -29,10 +29,10 @@ entry:
define void @t1(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t1:
; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
; CHECK: stur [[DEST]], [x0, #15]
; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
; CHECK: str [[DEST]], [x0]
; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
; CHECK: stur [[DEST:q[0-9]+]], [x0, #15]
; CHECK: str [[DEST:q[0-9]+]], [x0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i1 false)
ret void
}
@ -52,9 +52,9 @@ entry:
define void @t3(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t3:
; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG:[0-9]+]], #16]
; CHECK: str [[REG4]], [x0, #16]
; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
; CHECK: str [[DEST]], [x0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i1 false)
ret void

View File

@ -130,12 +130,9 @@ define void @test_va_copy() {
; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var
; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]]]
; CHECK: ldp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[SRC]]]
; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list
; CHECK: str [[BLOCK]], [x[[DST]]]
; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]], #16]
; CHECK: str [[BLOCK]], [x[[DST]], #16]
; CHECK: stp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[DST]]]
ret void
; CHECK: ret
}

View File

@ -34,8 +34,8 @@
define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) {
; CHECK: Precompute_Patch_Values
; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288]
; CHECK-NEXT: str [[VAL]], [sp, #232]
; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272]
; CHECK-NEXT: str [[VAL]], [sp, #232]
; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216]
entry:
%Control_Points = alloca [16 x [3 x double]], align 8

View File

@ -4,16 +4,14 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios10.0.0"
; PR33475 - Expect 64-bit operations as 128-operations are not legal
; However, we can generate a paired 64-bit loads and stores, without using
; floating point registers.
; CHECK-LABEL: pr33475
; CHECK-DAG: ldr [[R0:x[0-9]+]], [x1]
; CHECK-DAG: str [[R0]], [x0]
; CHECK-DAG: ldr [[R1:x[0-9]+]], [x1, #8]
; CHECK-DAG: str [[R1]], [x0, #8]
; CHECK-DAG: ldr [[R2:x[0-9]+]], [x1, #16]
; CHECK-DAG: str [[R2]], [x0, #16]
; CHECK-DAG: ldr [[R3:x[0-9]+]], [x1, #24]
; CHECK-DAG: str [[R3]], [x0, #24]
; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1, #16]
; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1]
; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0, #16]
; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0]
define void @pr33475(i8* %p0, i8* %p1) noimplicitfloat {
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p0, i8* align 4 %p1, i64 32, i1 false)

View File

@ -44,15 +44,14 @@ entry:
define void @t2(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t2:
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]!
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: movs [[INC:r[0-9]+]], #32
; CHECK: add.w r3, r0, #16
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]]
; CHECK: movw [[REG2:r[0-9]+]], #16716
; CHECK: movt [[REG2:r[0-9]+]], #72
; CHECK: str [[REG2]], [r0]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3]
; CHECK-T1-LABEL: t2:
; CHECK-T1: bl _memcpy
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i1 false)