From 62a274f0e9fda3834d3ecd9dfec4a16fdaa79269 Mon Sep 17 00:00:00 2001 From: Adam Nemet Date: Fri, 17 Jun 2016 22:35:41 +0000 Subject: [PATCH] [LAA] Enable symbolic stride speculation for all LAA clients This is a functional change for LLE and LDist. The other clients (LV, LVerLICM) already had this explicitly enabled. The temporary boolean parameter to LAA that allowed turning off speculation of symbolic strides is removed. This makes LAA's caching interface LAA::getInfo only take the loop as the parameter. This makes the interface more friendly to the new Pass Manager. The flag -enable-mem-access-versioning is moved from LV to LAA, which now allows turning off speculation globally. llvm-svn: 273064 --- include/llvm/Analysis/LoopAccessAnalysis.h | 13 +--- lib/Analysis/LoopAccessAnalysis.cpp | 38 +++++----- lib/Transforms/Scalar/LoopVersioningLICM.cpp | 2 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 17 +---- .../LoopDistribute/symbolic-stride.ll | 65 +++++++++++++++++ .../LoopLoadElim/symbolic-stride.ll | 69 +++++++++++++++++-- 6 files changed, 154 insertions(+), 50 deletions(-) create mode 100644 test/Transforms/LoopDistribute/symbolic-stride.ll diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h index dba0b362fd8..79110a874d6 100644 --- a/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/include/llvm/Analysis/LoopAccessAnalysis.h @@ -513,8 +513,7 @@ class LoopAccessInfo { public: LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL, const TargetLibraryInfo *TLI, AliasAnalysis *AA, - DominatorTree *DT, LoopInfo *LI, - bool SpeculateSymbolicStrides); + DominatorTree *DT, LoopInfo *LI); /// Return true we can analyze the memory accesses in the loop and there are /// no memory dependence cycles. @@ -585,11 +584,6 @@ public: /// \brief Print the information about the memory accesses in the loop. 
void print(raw_ostream &OS, unsigned Depth = 0) const; - /// \brief Used to ensure that if the analysis was run with speculating the - /// value of symbolic strides, the client queries it with the same assumption. - /// Only used in DEBUG build but we don't want NDEBUG-dependent ABI. - bool SpeculateSymbolicStrides; - /// \brief Checks existence of store to invariant address inside loop. /// If the loop has any store to invariant address, then it returns true, /// else returns false. @@ -715,11 +709,8 @@ public: /// \brief Query the result of the loop access information for the loop \p L. /// - /// \p SpeculateSymbolicStrides enables symbolic value speculation. The - /// corresponding run-time checks are collected in LAI::PSE. - /// /// If there is no cached result available run the analysis. - const LoopAccessInfo &getInfo(Loop *L, bool SpeculateSymbolicStrides = false); + const LoopAccessInfo &getInfo(Loop *L); void releaseMemory() override { // Invalidate the cache when the pass is freed. diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index 69e7ce49cb6..44208408b0f 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -65,6 +65,21 @@ static cl::opt "loop-access analysis (default = 100)"), cl::init(100)); +/// This enables versioning on the strides of symbolically striding memory +/// accesses in code like the following. +/// for (i = 0; i < N; ++i) +/// A[i * Stride1] += B[i * Stride2] ... +/// +/// Will be roughly translated to +/// if (Stride1 == 1 && Stride2 == 1) { +/// for (i = 0; i < N; i+=4) +/// A[i:i+3] += ... +/// } else +/// ... +static cl::opt EnableMemAccessVersioning( + "enable-mem-access-versioning", cl::init(true), cl::Hidden, + cl::desc("Enable symbolic stride memory access versioning")); + /// \brief Enable store-to-load forwarding conflict detection. This option can /// be disabled for correctness testing. 
static cl::opt EnableForwardingConflictDetection( @@ -1540,7 +1555,7 @@ void LoopAccessInfo::analyzeLoop() { NumLoads++; Loads.push_back(Ld); DepChecker.addAccess(Ld); - if (SpeculateSymbolicStrides) + if (EnableMemAccessVersioning) collectStridedAccess(Ld); continue; } @@ -1564,7 +1579,7 @@ void LoopAccessInfo::analyzeLoop() { NumStores++; Stores.push_back(St); DepChecker.addAccess(St); - if (SpeculateSymbolicStrides) + if (EnableMemAccessVersioning) collectStridedAccess(St); } } // Next instr. @@ -1904,11 +1919,9 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) { LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL, const TargetLibraryInfo *TLI, AliasAnalysis *AA, - DominatorTree *DT, LoopInfo *LI, - bool SpeculateSymbolicStrides) - : SpeculateSymbolicStrides(SpeculateSymbolicStrides), PSE(*SE, *L), - PtrRtChecking(SE), DepChecker(PSE, L), TheLoop(L), DL(DL), TLI(TLI), - AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), + DominatorTree *DT, LoopInfo *LI) + : PSE(*SE, *L), PtrRtChecking(SE), DepChecker(PSE, L), TheLoop(L), DL(DL), + TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1U), CanVecMem(false), StoreToLoopInvariantAddress(false) { if (canAnalyzeLoop()) @@ -1955,19 +1968,12 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { PSE.print(OS, Depth); } -const LoopAccessInfo & -LoopAccessAnalysis::getInfo(Loop *L, bool SpeculateSymbolicStrides) { +const LoopAccessInfo &LoopAccessAnalysis::getInfo(Loop *L) { auto &LAI = LoopAccessInfoMap[L]; -#ifndef NDEBUG - assert((!LAI || LAI->SpeculateSymbolicStrides == SpeculateSymbolicStrides) && - "Symbolic strides changed for loop"); -#endif - if (!LAI) { const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - LAI = llvm::make_unique(L, SE, DL, TLI, AA, DT, LI, - SpeculateSymbolicStrides); + LAI = llvm::make_unique(L, SE, DL, TLI, AA, DT, LI); } return *LAI.get(); } diff --git 
a/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 3ac01a75892..b88ca7e717d 100644 --- a/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -385,7 +385,7 @@ bool LoopVersioningLICM::legalLoopInstructions() { return false; } // Get LoopAccessInfo from current loop. - LAI = &LAA->getInfo(CurLoop, true); + LAI = &LAA->getInfo(CurLoop); // Check LoopAccessInfo for need of runtime check. if (LAI->getRuntimePointerChecking()->getChecks().empty()) { DEBUG(dbgs() << " LAA: Runtime check not found !!\n"); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 0c4605ea4b3..954b5938cf5 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -130,21 +130,6 @@ static cl::opt MaximizeBandwidth( cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop.")); -/// This enables versioning on the strides of symbolically striding memory -/// accesses in code like the following. -/// for (i = 0; i < N; ++i) -/// A[i * Stride1] += B[i * Stride2] ... -/// -/// Will be roughly translated to -/// if (Stride1 == 1 && Stride2 == 1) { -/// for (i = 0; i < N; i+=4) -/// A[i:i+3] += ... -/// } else -/// ... 
-static cl::opt EnableMemAccessVersioning( - "enable-mem-access-versioning", cl::init(true), cl::Hidden, - cl::desc("Enable symbolic stride memory access versioning")); - static cl::opt EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); @@ -4970,7 +4955,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { } bool LoopVectorizationLegality::canVectorizeMemory() { - LAI = &LAA->getInfo(TheLoop, EnableMemAccessVersioning); + LAI = &LAA->getInfo(TheLoop); auto &OptionalReport = LAI->getReport(); if (OptionalReport) emitAnalysis(VectorizationReport(*OptionalReport)); diff --git a/test/Transforms/LoopDistribute/symbolic-stride.ll b/test/Transforms/LoopDistribute/symbolic-stride.ll new file mode 100644 index 00000000000..73d3d19c5dd --- /dev/null +++ b/test/Transforms/LoopDistribute/symbolic-stride.ll @@ -0,0 +1,65 @@ +; RUN: opt -basicaa -loop-distribute -S < %s | \ +; RUN: FileCheck %s --check-prefix=ALL --check-prefix=STRIDE_SPEC + +; RUN: opt -basicaa -loop-distribute -S -enable-mem-access-versioning=0 < %s | \ +; RUN: FileCheck %s --check-prefix=ALL --check-prefix=NO_STRIDE_SPEC + +; If we don't speculate stride for 1 we can't distribute along the line +; because we could have a backward dependence: +; +; for (i = 0; i < n; i++) { +; A[i + 1] = A[i] * B[i]; +; ======================= +; C[i] = D[i] * A[stride * i]; +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; ALL-LABEL: @f( +define void @f(i32* noalias %a, + i32* noalias %b, + i32* noalias %c, + i32* noalias %d, + i64 %stride) { +entry: + br label %for.body + +; STRIDE_SPEC: %ident.check = icmp ne i64 %stride, 1 + +; STRIDE_SPEC: for.body.ldist1: +; NO_STRIDE_SPEC-NOT: for.body.ldist1: + +for.body: ; preds = %for.body, %entry + %ind = phi i64 [ 0, %entry ], [ %add, %for.body ] + + %arrayidxA = getelementptr inbounds i32, 
i32* %a, i64 %ind + %loadA = load i32, i32* %arrayidxA, align 4 + + %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind + %loadB = load i32, i32* %arrayidxB, align 4 + + %mulA = mul i32 %loadB, %loadA + + %add = add nuw nsw i64 %ind, 1 + %arrayidxA_plus_4 = getelementptr inbounds i32, i32* %a, i64 %add + store i32 %mulA, i32* %arrayidxA_plus_4, align 4 + + %arrayidxD = getelementptr inbounds i32, i32* %d, i64 %ind + %loadD = load i32, i32* %arrayidxD, align 4 + + %mul = mul i64 %ind, %stride + %arrayidxStridedA = getelementptr inbounds i32, i32* %a, i64 %mul + %loadStridedA = load i32, i32* %arrayidxStridedA, align 4 + + %mulC = mul i32 %loadD, %loadStridedA + + %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind + store i32 %mulC, i32* %arrayidxC, align 4 + + %exitcond = icmp eq i64 %add, 20 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Transforms/LoopLoadElim/symbolic-stride.ll b/test/Transforms/LoopLoadElim/symbolic-stride.ll index 3e33b3fc173..7a2d1b6c7e3 100644 --- a/test/Transforms/LoopLoadElim/symbolic-stride.ll +++ b/test/Transforms/LoopLoadElim/symbolic-stride.ll @@ -1,28 +1,44 @@ -; RUN: opt -loop-load-elim -S < %s | FileCheck %s +; RUN: opt -loop-load-elim -S < %s | \ +; RUN: FileCheck %s -check-prefix=ALL -check-prefix=ONE_STRIDE_SPEC \ +; RUN: -check-prefix=TWO_STRIDE_SPEC -; Forwarding in the presence of symbolic strides is currently not supported: +; RUN: opt -loop-load-elim -S -enable-mem-access-versioning=0 < %s | \ +; RUN: FileCheck %s -check-prefix=ALL -check-prefix=NO_ONE_STRIDE_SPEC \ +; RUN: -check-prefix=NO_TWO_STRIDE_SPEC + +; RUN: opt -loop-load-elim -S -loop-load-elimination-scev-check-threshold=1 < %s | \ +; RUN: FileCheck %s -check-prefix=ALL -check-prefix=ONE_STRIDE_SPEC \ +; RUN: -check-prefix=NO_TWO_STRIDE_SPEC + +; Forwarding in the presence of symbolic strides: ; ; for (unsigned i = 0; i < 100; i++) ; A[i + 1] = A[Stride * i] + B[i]; target datalayout 
= "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -; CHECK-LABEL: @f( +; ALL-LABEL: @f( define void @f(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N, i64 %stride) { + +; ONE_STRIDE_SPEC: %ident.check = icmp ne i64 %stride, 1 + entry: -; CHECK-NOT: %load_initial = load i32, i32* %A +; NO_ONE_STRIDE_SPEC-NOT: %load_initial = load i32, i32* %A +; ONE_STRIDE_SPEC: %load_initial = load i32, i32* %A br label %for.body for.body: ; preds = %for.body, %entry -; CHECK-NOT: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ] +; NO_ONE_STRIDE_SPEC-NOT: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ] +; ONE_STRIDE_SPEC: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ] %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] %mul = mul i64 %indvars.iv, %stride %arrayidx = getelementptr inbounds i32, i32* %A, i64 %mul %load = load i32, i32* %arrayidx, align 4 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv %load_1 = load i32, i32* %arrayidx2, align 4 -; CHECK-NOT: %add = add i32 %load_1, %store_forwarded +; NO_ONE_STRIDE_SPEC-NOT: %add = add i32 %load_1, %store_forwarded +; ONE_STRIDE_SPEC: %add = add i32 %load_1, %store_forwarded %add = add i32 %load_1, %load %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next @@ -33,3 +49,44 @@ for.body: ; preds = %for.body, %entry for.end: ; preds = %for.body ret void } + +; With two symbolic strides: +; +; for (unsigned i = 0; i < 100; i++) +; A[Stride2 * (i + 1)] = A[Stride1 * i] + B[i]; + +; ALL-LABEL: @two_strides( +define void @two_strides(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i64 %N, + i64 %stride.1, i64 %stride.2) { + +; TWO_STRIDE_SPEC: %ident.check = icmp ne i64 %stride.2, 1 +; TWO_STRIDE_SPEC: %ident.check1 = icmp ne i64 %stride.1, 1 +; NO_TWO_STRIDE_SPEC-NOT: %ident.check{{.*}} = icmp ne i64 %stride{{.*}}, 1 + +entry: 
+; NO_TWO_STRIDE_SPEC-NOT: %load_initial = load i32, i32* %A +; TWO_STRIDE_SPEC: %load_initial = load i32, i32* %A + br label %for.body + +for.body: ; preds = %for.body, %entry +; NO_TWO_STRIDE_SPEC-NOT: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ] +; TWO_STRIDE_SPEC: %store_forwarded = phi i32 [ %load_initial, {{.*}} ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %mul = mul i64 %indvars.iv, %stride.1 + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %mul + %load = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %load_1 = load i32, i32* %arrayidx2, align 4 +; NO_TWO_STRIDE_SPEC-NOT: %add = add i32 %load_1, %store_forwarded +; TWO_STRIDE_SPEC: %add = add i32 %load_1, %store_forwarded + %add = add i32 %load_1, %load + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %mul.2 = mul i64 %indvars.iv.next, %stride.2 + %arrayidx_next = getelementptr inbounds i32, i32* %A, i64 %mul.2 + store i32 %add, i32* %arrayidx_next, align 4 + %exitcond = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +}