From 10a3d404573212c9690d63c1ecddfbf70b8e2f9e Mon Sep 17 00:00:00 2001 From: maekawatoshiki Date: Tue, 20 Jul 2021 00:31:18 +0900 Subject: [PATCH] [LICM] Create LoopNest Invariant Code Motion (LNICM) pass This patch adds a new pass called LNICM which is a LoopNest version of LICM and a test case to show how LNICM works. Basically, LNICM only hoists invariants out of loop nest (not a loop) to keep/make perfect loop nest. This enables later optimizations that require perfect loop nest. Reviewed By: Whitney Differential Revision: https://reviews.llvm.org/D104180 --- include/llvm/Transforms/Scalar/LICM.h | 16 ++++ include/llvm/Transforms/Utils/LoopUtils.h | 2 +- lib/Passes/PassRegistry.def | 1 + lib/Transforms/Scalar/LICM.cpp | 38 ++++++-- test/Transforms/LICM/lnicm.ll | 103 ++++++++++++++++++++++ 5 files changed, 154 insertions(+), 6 deletions(-) create mode 100644 test/Transforms/LICM/lnicm.ll diff --git a/include/llvm/Transforms/Scalar/LICM.h b/include/llvm/Transforms/Scalar/LICM.h index a8f1c534861..751f75c0ccb 100644 --- a/include/llvm/Transforms/Scalar/LICM.h +++ b/include/llvm/Transforms/Scalar/LICM.h @@ -57,6 +57,22 @@ public: PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); }; + +/// Performs LoopNest Invariant Code Motion Pass. 
+class LNICMPass : public PassInfoMixin<LNICMPass> { + unsigned LicmMssaOptCap; + unsigned LicmMssaNoAccForPromotionCap; + +public: + LNICMPass() + : LicmMssaOptCap(SetLicmMssaOptCap), + LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap) {} + LNICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap) + : LicmMssaOptCap(LicmMssaOptCap), + LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; } // end namespace llvm #endif // LLVM_TRANSFORMS_SCALAR_LICM_H diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h index ccd953b72bd..247b911b7c8 100644 --- a/include/llvm/Transforms/Utils/LoopUtils.h +++ b/include/llvm/Transforms/Utils/LoopUtils.h @@ -165,7 +165,7 @@ bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, BlockFrequencyInfo *, TargetLibraryInfo *, Loop *, AliasSetTracker *, MemorySSAUpdater *, ScalarEvolution *, ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &, - OptimizationRemarkEmitter *); + OptimizationRemarkEmitter *, bool); /// This function deletes dead loops. The caller of this function needs to /// guarantee that the loop is infact dead. 
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 6316765b6f1..e0b2beef38c 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -415,6 +415,7 @@ LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass()) LOOP_PASS("dot-ddg", DDGDotPrinterPass()) LOOP_PASS("invalidate", InvalidateAllAnalysesPass()) LOOP_PASS("licm", LICMPass()) +LOOP_PASS("lnicm", LNICMPass()) LOOP_PASS("loop-flatten", LoopFlattenPass()) LOOP_PASS("loop-idiom", LoopIdiomRecognizePass()) LOOP_PASS("loop-instsimplify", LoopInstSimplifyPass()) diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 22ed6ae6c53..e4bb0793c89 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -196,7 +196,7 @@ struct LoopInvariantCodeMotion { bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA, - OptimizationRemarkEmitter *ORE); + OptimizationRemarkEmitter *ORE, bool LoopNestMode = false); LoopInvariantCodeMotion(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap) @@ -295,6 +295,33 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, return PA; } +PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis + // pass. Function analyses need to be preserved across loop transformations + // but ORE cannot be preserved (see comment before the pass definition). 
+ OptimizationRemarkEmitter ORE(LN.getParent()); + + LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + + Loop &OutermostLoop = LN.getOutermostLoop(); + bool Changed = LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, AR.BFI, + &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, &ORE, true); + + if (!Changed) + return PreservedAnalyses::all(); + + auto PA = getLoopPassPreservedAnalyses(); + + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + if (AR.MSSA) + PA.preserve<MemorySSAAnalysis>(); + + return PA; +} + char LegacyLICMPass::ID = 0; INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false, false) @@ -347,7 +374,8 @@ llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags( bool LoopInvariantCodeMotion::runOnLoop( Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, - ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE) { + ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, + bool LoopNestMode) { bool Changed = false; assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); @@ -414,7 +442,7 @@ bool LoopInvariantCodeMotion::runOnLoop( if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, CurAST.get(), MSSAU.get(), SE, &SafetyInfo, - *Flags.get(), ORE); + *Flags.get(), ORE, LoopNestMode); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -859,7 +887,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, - OptimizationRemarkEmitter *ORE) { + OptimizationRemarkEmitter *ORE, bool LoopNestMode) { // Verify inputs. 
assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -882,7 +910,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, for (BasicBlock *BB : Worklist) { // Only need to process the contents of this block if it is not part of a // subloop (which would already have been processed). - if (inSubLoop(BB, CurLoop, LI)) + if (!LoopNestMode && inSubLoop(BB, CurLoop, LI)) continue; for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) { diff --git a/test/Transforms/LICM/lnicm.ll b/test/Transforms/LICM/lnicm.ll new file mode 100644 index 00000000000..a5c301df7c7 --- /dev/null +++ b/test/Transforms/LICM/lnicm.ll @@ -0,0 +1,103 @@ +; RUN: opt -aa-pipeline=basic-aa -passes='loop(loop-interchange)' -S %s | FileCheck %s --check-prefixes INTC +; RUN: opt -aa-pipeline=basic-aa -passes='loop(lnicm,loop-interchange)' -S %s | FileCheck %s --check-prefixes LNICM,CHECK +; RUN: opt -aa-pipeline=basic-aa -passes='loop(licm,loop-interchange)' -S %s | FileCheck %s --check-prefixes LICM,CHECK + +; This test represents the following function: +; void test(int x[10][10], int y[10], int *z) { +; for (int k = 0; k < 10; k++) { +; int tmp = *z; +; for (int i = 0; i < 10; i++) +; x[i][k] += y[k] + tmp; +; } +; } +; We only want to hoist the load of z out of the loop nest. +; LICM hoists the load of y[k] out of the i-loop, but LNICM doesn't do so +; to keep perfect loop nest. This enables optimizations that require +; perfect loop nest (e.g. loop-interchange) to perform. 
+ + +define dso_local void @test([10 x i32]* noalias %x, i32* noalias readonly %y, i32* readonly %z) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[Z:%.*]] = load i32, i32* %z, align 4 +; CHECK-NEXT: br label [[FOR_BODY3_PREHEADER:%.*]] +; LNICM: for.body.preheader: +; LICM-NOT: for.body.preheader: +; INTC-NOT: for.body.preheader: +; LNICM-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; LNICM-NEXT: [[K:%.*]] = phi i32 [ [[INC10:%.*]], [[FOR_END:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] +; LNICM-NEXT: br label [[FOR_BODY3_SPLIT1:%.*]] +; LICM: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4 +; LNICM: for.body3.preheader: +; LICM-NOT: for.body3.preheader: +; INTC-NOT: for.body3.preheader: +; LNICM-NEXT: br label [[FOR_BODY3:%.*]] +; CHECK: for.body3: +; LNICM-NEXT: [[I:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 0, [[FOR_BODY3_PREHEADER:%.*]] ] +; LNICM-NEXT: br label [[FOR_BODY_PREHEADER:%.*]] +; LNICM: for.body3.split1: +; LNICM-NEXT: [[IDXPROM:%.*]] = sext i32 [[K:%.*]] to i64 +; LNICM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* %y, i64 [[IDXPROM:%.*]] +; LNICM-NEXT: [[TMP:%.*]] = load i32, i32* [[ARRAYIDX:%.*]], align 4 +; LNICM-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP:%.*]], [[Z:%.*]] +; LNICM-NEXT: [[IDXPROM4:%.*]] = sext i32 [[I:%.*]] to i64 +; LNICM-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 [[IDXPROM4:%.*]] +; LNICM-NEXT: [[IDXPROM6:%.*]] = sext i32 [[K:%.*]] to i64 +; LNICM-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ARRAYIDX5:%.*]], i64 0, i64 [[IDXPROM6:%.*]] +; LNICM-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX7:%.*]], align 4 +; LNICM-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP2:%.*]], [[ADD:%.*]] +; LNICM-NEXT: store i32 [[ADD8:%.*]], i32* [[ARRAYIDX7:%.*]], align 4 +; LNICM-NEXT: [[INC:%.*]] = add nsw i32 [[I:%.*]], 1 +; LNICM-NEXT: [[CMP2:%.*]] = icmp slt i32 [[INC:%.*]], 10 +; LNICM-NEXT: br label [[FOR_END:%.*]] +; LNICM: 
for.body3.split: +; LICM-NOT: for.body3.split: +; INTC-NOT: for.body3.split: +; LNICM-NEXT: [[TMP3:%.*]] = add nsw i32 [[I:%.*]], 1 +; LNICM-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP3:%.*]], 10 +; LNICM-NEXT: br i1 [[TMP4:%.*]], label [[FOR_BODY3:%.*]], label [[FOR_END11:%.*]], !llvm.loop !0 +; LNICM: for.end: +; LNICM-NEXT: [[INC10:%.*]] = add nsw i32 [[K:%.*]], 1 +; LNICM-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC10:%.*]], 10 +; LNICM-NEXT: br i1 [[CMP:%.*]], label [[FOR_BODY:%.*]], label [[FOR_BODY3_SPLIT:%.*]], !llvm.loop !2 +; LNICM: for.end11: +; LNICM-NEXT: ret void + +entry: + br label %for.body + +for.body: + %k.02 = phi i32 [ 0, %entry ], [ %inc10, %for.end ] + %0 = load i32, i32* %z, align 4 + br label %for.body3 + +for.body3: + %i.01 = phi i32 [ 0, %for.body ], [ %inc, %for.body3 ] + %idxprom = sext i32 %k.02 to i64 + %arrayidx = getelementptr inbounds i32, i32* %y, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %1, %0 + %idxprom4 = sext i32 %i.01 to i64 + %arrayidx5 = getelementptr inbounds [10 x i32], [10 x i32]* %x, i64 %idxprom4 + %idxprom6 = sext i32 %k.02 to i64 + %arrayidx7 = getelementptr inbounds [10 x i32], [10 x i32]* %arrayidx5, i64 0, i64 %idxprom6 + %2 = load i32, i32* %arrayidx7, align 4 + %add8 = add nsw i32 %2, %add + store i32 %add8, i32* %arrayidx7, align 4 + %inc = add nsw i32 %i.01, 1 + %cmp2 = icmp slt i32 %inc, 10 + br i1 %cmp2, label %for.body3, label %for.end, !llvm.loop !0 + +for.end: + %inc10 = add nsw i32 %k.02, 1 + %cmp = icmp slt i32 %inc10, 10 + br i1 %cmp, label %for.body, label %for.end11, !llvm.loop !2 + +for.end11: + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.mustprogress"} +!2 = distinct !{!2, !1}