From 10a3d404573212c9690d63c1ecddfbf70b8e2f9e Mon Sep 17 00:00:00 2001
From: maekawatoshiki <konndennsa@gmail.com>
Date: Tue, 20 Jul 2021 00:31:18 +0900
Subject: [PATCH] [LICM] Create LoopNest Invariant Code Motion (LNICM) pass

This patch adds a new pass called LNICM, which is a LoopNest version of LICM, and a test case showing how LNICM works.
Basically, LNICM only hoists invariants out of the whole loop nest (not out of an individual loop) to keep/make the loop nest perfect. This enables later optimizations that require a perfect loop nest.

Reviewed By: Whitney

Differential Revision: https://reviews.llvm.org/D104180
---
 include/llvm/Transforms/Scalar/LICM.h     |  16 ++++
 include/llvm/Transforms/Utils/LoopUtils.h |   2 +-
 lib/Passes/PassRegistry.def               |   1 +
 lib/Transforms/Scalar/LICM.cpp            |  38 ++++++--
 test/Transforms/LICM/lnicm.ll             | 103 ++++++++++++++++++++++
 5 files changed, 154 insertions(+), 6 deletions(-)
 create mode 100644 test/Transforms/LICM/lnicm.ll
diff --git a/include/llvm/Transforms/Scalar/LICM.h b/include/llvm/Transforms/Scalar/LICM.h
index a8f1c534861..751f75c0ccb 100644
--- a/include/llvm/Transforms/Scalar/LICM.h
+++ b/include/llvm/Transforms/Scalar/LICM.h
@@ -57,6 +57,22 @@ public:
   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
                         LoopStandardAnalysisResults &AR, LPMUpdater &U);
 };
+
+/// LoopNest version of LICM: runs LICM on the outermost loop of a loop nest.
+class LNICMPass : public PassInfoMixin<LNICMPass> {
+  unsigned LicmMssaOptCap;
+  unsigned LicmMssaNoAccForPromotionCap;
+
+public:
+  LNICMPass()
+      : LicmMssaOptCap(SetLicmMssaOptCap),
+        LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap) {}
+  LNICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap)
+      : LicmMssaOptCap(LicmMssaOptCap),
+        LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {}
+  PreservedAnalyses run(LoopNest &LN, LoopAnalysisManager &AM,
+                        LoopStandardAnalysisResults &AR, LPMUpdater &U);
+};
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_SCALAR_LICM_H
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index ccd953b72bd..247b911b7c8 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -165,7 +165,7 @@ bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
                  BlockFrequencyInfo *, TargetLibraryInfo *, Loop *,
                  AliasSetTracker *, MemorySSAUpdater *, ScalarEvolution *,
                  ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &,
-                 OptimizationRemarkEmitter *);
+                 OptimizationRemarkEmitter *, bool);
 
 /// This function deletes dead loops. The caller of this function needs to
 /// guarantee that the loop is infact dead.
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index 6316765b6f1..e0b2beef38c 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -415,6 +415,7 @@ LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass())
 LOOP_PASS("dot-ddg", DDGDotPrinterPass())
 LOOP_PASS("invalidate<all>", InvalidateAllAnalysesPass())
 LOOP_PASS("licm", LICMPass())
+LOOP_PASS("lnicm", LNICMPass())
 LOOP_PASS("loop-flatten", LoopFlattenPass())
 LOOP_PASS("loop-idiom", LoopIdiomRecognizePass())
 LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass())
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 22ed6ae6c53..e4bb0793c89 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -196,7 +196,7 @@ struct LoopInvariantCodeMotion {
   bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
                  BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI,
                  TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA,
-                 OptimizationRemarkEmitter *ORE);
+                 OptimizationRemarkEmitter *ORE, bool LoopNestMode = false);
 
   LoopInvariantCodeMotion(unsigned LicmMssaOptCap,
                           unsigned LicmMssaNoAccForPromotionCap)
@@ -295,6 +295,33 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
   return PA;
 }
 
+PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM,
+                                 LoopStandardAnalysisResults &AR,
+                                 LPMUpdater &) {
+  // ORE is constructed locally instead of being requested as an analysis:
+  // function analyses need to be preserved across loop transformations, but
+  // ORE cannot be preserved (see comment before the pass definition).
+  OptimizationRemarkEmitter ORE(LN.getParent());
+
+  LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
+  // Run LICM on the outermost loop only; the final argument is LoopNestMode.
+  Loop &OutermostLoop = LN.getOutermostLoop();
+  bool Changed = LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, AR.BFI,
+                                &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, &ORE, true);
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  auto PA = getLoopPassPreservedAnalyses();
+
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
+  if (AR.MSSA)
+    PA.preserve<MemorySSAAnalysis>();
+
+  return PA;
+}
+
 char LegacyLICMPass::ID = 0;
 INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
                       false, false)
@@ -347,7 +374,8 @@ llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(
 bool LoopInvariantCodeMotion::runOnLoop(
     Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
     BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
-    ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE) {
+    ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE,
+    bool LoopNestMode) {
   bool Changed = false;
 
   assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -414,7 +442,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
   if (Preheader)
     Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L,
                            CurAST.get(), MSSAU.get(), SE, &SafetyInfo,
-                           *Flags.get(), ORE);
+                           *Flags.get(), ORE, LoopNestMode);
 
   // Now that all loop invariants have been removed from the loop, promote any
   // memory references to scalars that we can.
@@ -859,7 +887,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
                        AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
                        ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
                        SinkAndHoistLICMFlags &Flags,
-                       OptimizationRemarkEmitter *ORE) {
+                       OptimizationRemarkEmitter *ORE, bool LoopNestMode) {
   // Verify inputs.
   assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
          CurLoop != nullptr && SafetyInfo != nullptr &&
@@ -882,7 +910,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
   for (BasicBlock *BB : Worklist) {
     // Only need to process the contents of this block if it is not part of a
     // subloop (which would already have been processed).
-    if (inSubLoop(BB, CurLoop, LI))
+    if (!LoopNestMode && inSubLoop(BB, CurLoop, LI))
       continue;
 
     for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
diff --git a/test/Transforms/LICM/lnicm.ll b/test/Transforms/LICM/lnicm.ll
new file mode 100644
index 00000000000..a5c301df7c7
--- /dev/null
+++ b/test/Transforms/LICM/lnicm.ll
@@ -0,0 +1,103 @@
+; RUN: opt -aa-pipeline=basic-aa -passes='loop(loop-interchange)'       -S %s | FileCheck %s --check-prefixes INTC
+; RUN: opt -aa-pipeline=basic-aa -passes='loop(lnicm,loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM,CHECK
+; RUN: opt -aa-pipeline=basic-aa -passes='loop(licm,loop-interchange)'  -S %s | FileCheck %s --check-prefixes LICM,CHECK
+
+; This test represents the following function:
+; void test(int x[10][10], int y[10], int *z) {
+;   for (int k = 0; k < 10; k++) {
+;     int tmp = *z;
+;     for (int i = 0; i < 10; i++)
+;       x[i][k] += y[k] + tmp;
+;   }
+; }
+; We only want to hoist the load of z out of the loop nest.
+; LICM also hoists the load of y[k] out of the i-loop, but LNICM does not,
+; so that the loop nest stays perfect. This lets optimizations that require
+; a perfect loop nest (e.g. loop-interchange) run.
+
+
+define dso_local void @test([10 x i32]* noalias %x, i32* noalias readonly %y, i32* readonly %z) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   [[Z:%.*]] = load i32, i32* %z, align 4
+; CHECK-NEXT:   br label [[FOR_BODY3_PREHEADER:%.*]]
+; LNICM:      for.body.preheader:
+; LICM-NOT:   for.body.preheader:
+; INTC-NOT:   for.body.preheader:
+; LNICM-NEXT:   br label [[FOR_BODY:%.*]]
+; CHECK:      for.body:
+; LNICM-NEXT:   [[K:%.*]] = phi i32 [ [[INC10:%.*]], [[FOR_END:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; LNICM-NEXT:   br label [[FOR_BODY3_SPLIT1:%.*]]
+; LICM:         [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4
+; LNICM:      for.body3.preheader:
+; LICM-NOT:   for.body3.preheader:
+; INTC-NOT:   for.body3.preheader:
+; LNICM-NEXT:   br label [[FOR_BODY3:%.*]]
+; CHECK:      for.body3:
+; LNICM-NEXT:   [[I:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER:%.*]] ]
+; LNICM-NEXT:   br label [[FOR_BODY_PREHEADER:%.*]]
+; LNICM:      for.body3.split1:
+; LNICM-NEXT:   [[IDXPROM:%.*]] = sext i32 [[K:%.*]] to i64
+; LNICM-NEXT:   [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* %y, i64 [[IDXPROM:%.*]]
+; LNICM-NEXT:   [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4
+; LNICM-NEXT:   [[ADD:%.*]] = add nsw i32 [[TMP:%.*]], [[Z:%.*]]
+; LNICM-NEXT:   [[IDXPROM4:%.*]] = sext i32 [[I:%.*]] to i64
+; LNICM-NEXT:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 [[IDXPROM4:%.*]]
+; LNICM-NEXT:   [[IDXPROM6:%.*]] = sext i32 [[K:%.*]] to i64
+; LNICM-NEXT:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX5:%.*]], i64 0, i64 [[IDXPROM6:%.*]]
+; LNICM-NEXT:   [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX7:%.*]], align 4
+; LNICM-NEXT:   [[ADD8:%.*]] = add nsw i32 [[TMP2:%.*]], [[ADD:%.*]]
+; LNICM-NEXT:   store i32 [[ADD8:%.*]], i32* [[ARRAYIDX7:%.*]], align 4
+; LNICM-NEXT:   [[INC:%.*]] = add nsw i32 [[I:%.*]], 1
+; LNICM-NEXT:   [[CMP2:%.*]] = icmp slt i32 [[INC:%.*]], 10
+; LNICM-NEXT:   br label [[FOR_END:%.*]]
+; LNICM:      for.body3.split:
+; LICM-NOT:   for.body3.split:
+; INTC-NOT:   for.body3.split:
+; LNICM-NEXT:   [[TMP3:%.*]] = add nsw i32 [[I:%.*]], 1
+; LNICM-NEXT:   [[TMP4:%.*]] = icmp slt i32 [[TMP3:%.*]], 10
+; LNICM-NEXT:   br i1 [[TMP4:%.*]], label [[FOR_BODY3:%.*]], label [[FOR_END11:%.*]], !llvm.loop !0
+; LNICM:      for.end:
+; LNICM-NEXT:   [[INC10:%.*]] = add nsw i32 [[K:%.*]], 1
+; LNICM-NEXT:   [[CMP:%.*]] = icmp slt i32 [[INC10:%.*]], 10
+; LNICM-NEXT:   br i1 [[CMP:%.*]], label [[FOR_BODY:%.*]], label [[FOR_BODY3_SPLIT:%.*]], !llvm.loop !2
+; LNICM:      for.end11:
+; LNICM-NEXT:   ret void
+
+entry:
+  br label %for.body
+
+for.body:
+  %k.02 = phi i32 [ 0, %entry ], [ %inc10, %for.end ]
+  %0 = load i32, i32* %z, align 4 ; tmp = *z, invariant in both loops
+  br label %for.body3
+
+for.body3:
+  %i.01 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ]
+  %idxprom = sext i32 %k.02 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %y, i64 %idxprom
+  %1 = load i32, i32* %arrayidx, align 4 ; y[k], invariant in the i-loop only
+  %add = add nsw i32 %1, %0
+  %idxprom4 = sext i32 %i.01 to i64
+  %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 %idxprom4
+  %idxprom6 = sext i32 %k.02 to i64
+  %arrayidx7 = getelementptr inbounds [10 x i32], [10 x i32]* %arrayidx5, i64 0, i64 %idxprom6
+  %2 = load i32, i32* %arrayidx7, align 4
+  %add8 = add nsw i32 %2, %add
+  store i32 %add8, i32* %arrayidx7, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp2 = icmp slt i32 %inc, 10
+  br i1 %cmp2, label %for.body3, label %for.end, !llvm.loop !0
+
+for.end:
+  %inc10 = add nsw i32 %k.02, 1
+  %cmp = icmp slt i32 %inc10, 10
+  br i1 %cmp, label %for.body, label %for.end11, !llvm.loop !2
+
+for.end11:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = distinct !{!2, !1}