From 73703d12a457cc7e42307f8f548d2265efe1b1a2 Mon Sep 17 00:00:00 2001 From: Adam Nemet Date: Thu, 11 May 2017 17:06:17 +0000 Subject: [PATCH] [SLP] Emit optimization remarks The approach I followed was to emit the remark after getTreeCost concludes that SLP is profitable. I initially tried emitting them after the vectorizeRootInstruction calls in vectorizeChainsInBlock but I vaguely remember missing a few cases, for example in HorizontalReduction::tryToReduce. ORE is placed in BoUpSLP so that it's available from everywhere (notably HorizontalReduction::tryToReduce). We use the first instruction in the root bundle as the locator for the remark. In order to get a sense of how far the tree is spanning, I've included the size of the tree in the remark. This is not perfect, of course, but it gives you at least a rough idea about the tree. Then you can follow up with -view-slp-tree to really see the actual tree. llvm-svn: 302811 --- .../llvm/Transforms/Vectorize/SLPVectorizer.h | 4 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 42 +++++++++++++++--- .../SLPVectorizer/AArch64/getelementptr.ll | 43 +++++++++++++++++-- .../SLPVectorizer/AArch64/horizontal.ll | 33 +++++++++++++- .../SLPVectorizer/AArch64/remarks.ll | 32 ++++++++++++++ 5 files changed, 143 insertions(+), 11 deletions(-) create mode 100644 test/Transforms/SLPVectorizer/AArch64/remarks.ll diff --git a/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 10338f7937e..c514db41623 100644 --- a/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -24,6 +24,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" @@ -59,7 +60,8 @@ public: // Glue for old PM. 
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, - DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_); + DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, + OptimizationRemarkEmitter *ORE_); private: /// \brief Collect store and getelementptr instructions and organize them diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 960019f41df..f6334eb1410 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -299,10 +299,10 @@ public: BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, - const DataLayout *DL) + const DataLayout *DL, OptimizationRemarkEmitter *ORE) : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), - DL(DL), Builder(Se->getContext()) { + DL(DL), ORE(ORE), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -361,6 +361,8 @@ public: MinBWs.clear(); } + unsigned getTreeSize() const { return VectorizableTree.size(); } + /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -399,6 +401,8 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(); + OptimizationRemarkEmitter *getORE() { return ORE; } + private: struct TreeEntry; @@ -928,6 +932,8 @@ private: AssumptionCache *AC; DemandedBits *DB; const DataLayout *DL; + OptimizationRemarkEmitter *ORE; + unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. unsigned MinVecRegSize; // Set by cl::opt (default: 128). 
/// Instruction builder to construct the vectorized tree. @@ -3772,8 +3778,9 @@ struct SLPVectorizer : public FunctionPass { auto *DT = &getAnalysis().getDomTree(); auto *AC = &getAnalysis().getAssumptionCache(F); auto *DB = &getAnalysis().getDemandedBits(); + auto *ORE = &getAnalysis().getORE(); - return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); + return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -3785,6 +3792,7 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); @@ -3803,8 +3811,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A auto *DT = &AM.getResult(F); auto *AC = &AM.getResult(F); auto *DB = &AM.getResult(F); + auto *ORE = &AM.getResult(F); - bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); + bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); if (!Changed) return PreservedAnalyses::all(); @@ -3819,7 +3828,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, DominatorTree *DT_, - AssumptionCache *AC_, DemandedBits *DB_) { + AssumptionCache *AC_, DemandedBits *DB_, + OptimizationRemarkEmitter *ORE_) { SE = SE_; TTI = TTI_; TLI = TLI_; @@ -3847,7 +3857,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL); + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. 
@@ -3936,6 +3946,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + using namespace ore; + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast(Chain[i])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); + R.vectorizeTree(); // Move to the next bundle. @@ -4149,6 +4166,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); + R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", + cast(Ops[0])) + << "SLP vectorized with cost " << ore::NV("Cost", Cost) + << " and with tree size " + << ore::NV("TreeSize", R.getTreeSize())); + Value *VectorizedRoot = R.vectorizeTree(); // Reconstruct the build vector by extracting the vectorized root. This @@ -4492,6 +4515,12 @@ public: DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". (HorRdx)\n"); + auto *I0 = cast(VL[0]); + V.getORE()->emit( + OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize())); // Vectorize a tree. 
DebugLoc Loc = cast(ReducedVals[i])->getDebugLoc(); @@ -5146,6 +5175,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) namespace llvm { diff --git a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index e9b71963530..962a6c3b57b 100644 --- a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -1,4 +1,5 @@ -; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s +; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s +; RUN: cat %t | FileCheck -check-prefix=YAML %s target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" @@ -23,7 +24,25 @@ target triple = "aarch64--linux-gnu" ; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32> ; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]] ; CHECK: sext i32 [[X]] to i64 -; + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: getelementptr_4x32 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '11' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '5' + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: getelementptr_4x32 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '16' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '3' + define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) { entry: %cmp31 = icmp sgt i32 %n, 0 @@ -69,7 +88,25 @@ for.body: 
; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> ; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]] ; CHECK: sext i32 [[X]] to i64 -; + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: getelementptr_2x32 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '11' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '5' + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: getelementptr_2x32 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '6' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '3' + define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) { entry: %cmp31 = icmp sgt i32 %n, 0 diff --git a/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/test/Transforms/SLPVectorizer/AArch64/horizontal.ll index 8f8bf2648aa..1a6a2fb890d 100644 --- a/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,4 +1,5 @@ -; RUN: opt -slp-vectorizer -slp-threshold=-6 -S < %s | FileCheck %s +; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s +; RUN: cat %t | FileCheck -check-prefix=YAML %s ; FIXME: The threshold is changed to keep this test case a bit smaller. ; The AArch64 cost model should not give such high costs to select statements. 
@@ -10,6 +11,16 @@ target triple = "aarch64--linux" ; CHECK: load <4 x i32> ; CHECK: load <4 x i32> ; CHECK: select <4 x i1> + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedHorizontalReduction +; YAML-NEXT: Function: test_select +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' +; YAML-NEXT: - Cost: '4' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '8' + define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) { entry: %cmp.22 = icmp sgt i32 %h, 0 @@ -93,6 +104,16 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia ; CHECK: load <4 x i32> ; CHECK: load <4 x i32> ; CHECK: mul nsw <4 x i32> + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedHorizontalReduction +; YAML-NEXT: Function: reduction_with_br +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' +; YAML-NEXT: - Cost: '1' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '3' + entry: %cmp.16 = icmp sgt i32 %h, 0 br i1 %cmp.16, label %for.body.lr.ph, label %for.end @@ -150,6 +171,16 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK: load <8 x i8> ; CHECK: load <8 x i8> ; CHECK: select <8 x i1> + +; YAML: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedHorizontalReduction +; YAML-NEXT: Function: test_unrolled_select +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' +; YAML-NEXT: - Cost: '-33' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '10' + define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 { entry: %cmp.43 = icmp sgt i32 %h, 0 diff --git a/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/test/Transforms/SLPVectorizer/AArch64/remarks.ll new file mode 100644 index 00000000000..e8c37512594 --- /dev/null +++ 
b/test/Transforms/SLPVectorizer/AArch64/remarks.ll @@ -0,0 +1,32 @@ +; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s + +define void @f(double* %r, double* %w) { + %r0 = getelementptr inbounds double, double* %r, i64 0 + %r1 = getelementptr inbounds double, double* %r, i64 1 + %f0 = load double, double* %r0 + %f1 = load double, double* %r1 + %add0 = fadd double %f0, %f0 + %add1 = fadd double %f1, %f1 + %w0 = getelementptr inbounds double, double* %w, i64 0 + %w1 = getelementptr inbounds double, double* %w, i64 1 +; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3 + store double %add0, double* %w0, !dbg !9 + store double %add1, double* %w1 + ret void +} + + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2) +!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"PIC Level", i32 2} +!6 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"} +!7 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: true, unit: !0, variables: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 5, column: 10, scope: !7)