2017-06-01 13:39:39 +02:00
|
|
|
; The IR below was crafted so as:
|
|
|
|
; 1) To have a loop, so we create a loop pass manager
|
|
|
|
; 2) To be "immutable" in the sense that no pass in the standard
|
|
|
|
; pipeline will modify it.
|
|
|
|
; Since no transformations take place, we don't expect any analyses
|
|
|
|
; to be invalidated.
|
|
|
|
; Any invalidation that shows up here is a bug, unless we started modifying
|
|
|
|
; the IR, in which case we need to make it harder to modify.
|
|
|
|
;
|
|
|
|
; Prelink pipelines:
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager \
|
2020-11-16 21:48:42 +01:00
|
|
|
; RUN: -passes='thinlto-pre-link<O1>' -S %s 2>&1 \
|
2020-10-30 05:54:45 +01:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager \
|
2020-11-16 21:48:42 +01:00
|
|
|
; RUN: -passes='thinlto-pre-link<O2>' -S %s 2>&1 \
|
2020-10-30 05:54:45 +01:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager \
|
2020-11-16 21:48:42 +01:00
|
|
|
; RUN: -passes='thinlto-pre-link<O3>' -S -passes-ep-pipeline-start='no-op-module' %s 2>&1 \
|
2020-10-30 05:54:45 +01:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-EP-PIPELINE-START
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager \
|
2020-11-16 21:48:42 +01:00
|
|
|
; RUN: -passes='thinlto-pre-link<Os>' -S %s 2>&1 \
|
2020-10-30 05:54:45 +01:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager \
|
2020-11-16 21:48:42 +01:00
|
|
|
; RUN: -passes='thinlto-pre-link<Oz>' -S %s 2>&1 \
|
2020-10-30 05:54:45 +01:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager -new-pm-debug-info-for-profiling \
|
2020-11-16 21:48:42 +01:00
|
|
|
; RUN: -passes='thinlto-pre-link<O2>' -S %s 2>&1 \
|
2020-10-30 05:54:45 +01:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-DIS,CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-PRELINK-O
|
2017-06-01 13:39:39 +02:00
|
|
|
;
|
|
|
|
; Postlink pipelines:
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager \
|
2017-06-01 13:39:39 +02:00
|
|
|
; RUN: -passes='thinlto<O1>' -S %s 2>&1 \
|
2020-10-30 05:54:45 +01:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-POSTLINK-O,%llvmcheckext
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager \
|
2017-06-01 13:39:39 +02:00
|
|
|
; RUN: -passes='thinlto<O2>' -S %s 2>&1 \
|
2019-06-08 17:37:47 +02:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager -passes-ep-pipeline-start='no-op-module' \
|
2017-06-01 13:39:39 +02:00
|
|
|
; RUN: -passes='thinlto<O3>' -S %s 2>&1 \
|
2019-06-08 17:37:47 +02:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager \
|
2017-06-01 13:39:39 +02:00
|
|
|
; RUN: -passes='thinlto<Os>' -S %s 2>&1 \
|
2019-06-08 17:37:47 +02:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-Os
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager \
|
2017-06-01 13:39:39 +02:00
|
|
|
; RUN: -passes='thinlto<Oz>' -S %s 2>&1 \
|
2020-10-30 05:54:45 +01:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext
|
2021-04-06 06:31:07 +02:00
|
|
|
; RUN: opt -disable-verify -verify-cfg-preserved=0 -debug-pass-manager -new-pm-debug-info-for-profiling \
|
2017-07-29 06:10:24 +02:00
|
|
|
; RUN: -passes='thinlto<O2>' -S %s 2>&1 \
|
2019-06-08 17:37:47 +02:00
|
|
|
; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2
|
2020-10-30 05:54:45 +01:00
|
|
|
|
|
|
|
; Suppress FileCheck --allow-unused-prefixes=false diagnostics.
|
|
|
|
; CHECK-NOEXT: {{^}}
|
|
|
|
|
2021-05-04 01:09:56 +02:00
|
|
|
; CHECK-O: Running pass: Annotation2Metadata
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass
|
2018-01-23 02:25:20 +01:00
|
|
|
; CHECK-EP-PIPELINE-START-NEXT: Running pass: NoOpModulePass
|
2017-07-29 06:10:24 +02:00
|
|
|
; CHECK-DIS-NEXT: Running analysis: InnerAnalysisManagerProxy
|
2020-07-29 02:08:24 +02:00
|
|
|
; CHECK-DIS-NEXT: Running pass: AddDiscriminatorsPass
|
2017-08-10 07:10:32 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: PGOIndirectCallPromotion
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running analysis: ProfileSummaryAnalysis
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass
|
Change TargetLibraryInfo analysis passes to always require Function
Summary:
This is the first change to enable the TLI to be built per-function so
that -fno-builtin* handling can be migrated to use function attributes.
See discussion on D61634 for background. This is an enabler for fixing
handling of these options for LTO, for example.
This change should not affect behavior, as the provided function is not
yet used to build a specifically per-function TLI, but rather enables
that migration.
Most of the changes were very mechanical, e.g. passing a Function to the
legacy analysis pass's getTLI interface, or in Module level cases,
adding a callback. This is similar to the way the per-function TTI
analysis works.
There was one place where we were looking for builtins but not in the
context of a specific function. See FindCXAAtExit in
lib/Transforms/IPO/GlobalOpt.cpp. I'm somewhat concerned my workaround
could provide the wrong behavior in some corner cases. Suggestions
welcome.
Reviewers: chandlerc, hfinkel
Subscribers: arsenm, dschuff, jvesely, nhaehnle, mehdi_amini, javed.absar, sbc100, jgravelle-google, eraman, aheejin, steven_wu, george.burgess.iv, dexonsmith, jfb, asbirlea, gchatelet, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D66428
llvm-svn: 371284
2019-09-07 05:09:36 +02:00
|
|
|
; CHECK-PRELINK-O-NODIS-NEXT: Running analysis: InnerAnalysisManagerProxy
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
|
2021-04-12 20:51:51 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
|
|
|
|
; CHECK-O-NEXT: Running analysis: TargetIRAnalysis
|
|
|
|
; CHECK-O-NEXT: Running analysis: AssumptionAnalysis
|
|
|
|
; CHECK-O-NEXT: Running pass: SROA
|
|
|
|
; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis
|
|
|
|
; CHECK-O-NEXT: Running pass: EarlyCSEPass
|
|
|
|
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
|
2021-07-15 08:31:31 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: CoroEarlyPass
|
Recommit r317351 : Add CallSiteSplitting pass
This recommits r317351 after fixing a buildbot failure.
Original commit message:
Summary:
This change adds a pass which tries to split a call-site to pass
more constrained arguments if its argument is predicated in the control flow
so that we can expose better context to the later passes (e.g, inliner, jump
threading, or IPA-CP based function cloning, etc.).
As of now we support two cases :
1) If a call site is dominated by an OR condition and if any of its arguments
are predicated on this OR condition, try to split the condition with more
constrained arguments. For example, in the code below, we try to split the
call site since we can predicate the argument (ptr) based on the OR condition.
Split from :
if (!ptr || c)
callee(ptr);
to :
if (!ptr)
callee(null ptr) // set the known constant value
else if (c)
callee(nonnull ptr) // set non-null attribute in the argument
2) We can also split a call-site based on constant incoming values of a PHI
For example,
from :
BB0:
%c = icmp eq i32 %i1, %i2
br i1 %c, label %BB2, label %BB1
BB1:
br label %BB2
BB2:
%p = phi i32 [ 0, %BB0 ], [ 1, %BB1 ]
call void @bar(i32 %p)
to
BB0:
%c = icmp eq i32 %i1, %i2
br i1 %c, label %BB2-split0, label %BB1
BB1:
br label %BB2-split1
BB2-split0:
call void @bar(i32 0)
br label %BB2
BB2-split1:
call void @bar(i32 1)
br label %BB2
BB2:
%p = phi i32 [ 0, %BB2-split0 ], [ 1, %BB2-split1 ]
llvm-svn: 317362
2017-11-03 21:41:16 +01:00
|
|
|
; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
|
2021-06-28 21:56:10 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: OpenMPOptPass
|
Restore "[WPD/LowerTypeTests] Delay lowering/removal of type tests until after ICP"
This restores commit 80d0a137a5aba6998fadb764f1e11cb901aae233, and the
follow on fix in 873c0d0786dcf22f4af39f65df824917f70f2170, with a new
fix for test failures after a 2-stage clang bootstrap, and a more robust
fix for the Chromium build failure that an earlier version partially
fixed. See also discussion on D75201.
Reviewers: evgeny777
Subscribers: mehdi_amini, Prazek, hiraditya, steven_wu, dexonsmith, arphaman, davidxl, cfe-commits, llvm-commits
Tags: #clang, #llvm
Differential Revision: https://reviews.llvm.org/D73242
2020-03-17 19:08:35 +01:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LowerTypeTestsPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: IPSCCPPass
|
2017-10-25 15:40:08 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: CalledValuePropagationPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: GlobalOptPass
|
2020-07-29 02:08:24 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: PromotePass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: DeadArgumentEliminationPass
|
|
|
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
2017-07-27 18:54:15 +02:00
|
|
|
; CHECK-PRELINK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
|
2017-12-14 11:36:31 +01:00
|
|
|
; CHECK-O-NEXT: Running analysis: AAManager
|
2021-01-21 01:53:03 +01:00
|
|
|
; CHECK-O-NEXT: Running analysis: BasicAA
|
|
|
|
; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
|
|
|
|
; CHECK-O-NEXT: Running analysis: TypeBasedAA
|
2019-04-15 18:49:00 +02:00
|
|
|
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
|
2020-04-28 22:25:15 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
|
|
|
|
; CHECK-O-NEXT: Running analysis: InlineAdvisorAnalysis
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
|
|
|
|
; CHECK-O-NEXT: Running analysis: GlobalsAA
|
|
|
|
; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
|
2021-04-27 17:56:11 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}AAManager
|
|
|
|
; CHECK-O-NEXT: Invalidating analysis: AAManager
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis
|
2017-08-10 07:10:32 +02:00
|
|
|
; CHECK-PRELINK-O-NEXT: Running analysis: ProfileSummaryAnalysis
|
[PassManager] Run additional LICM before LoopRotate
Loop rotation often has to perform code duplication
from header into preheader, which introduces PHI nodes.
>>! In D99204, @thopre wrote:
>
> With loop peeling, it is important that unnecessary PHIs be avoided or
> it will lead to spurious peeling. One source of such PHIs is loop
> rotation which creates PHIs for invariant loads. Those PHIs are
> particularly problematic since loop peeling is now run as part of simple
> loop unrolling before GVN is run, and are thus a source of spurious
> peeling.
>
> Note that while some of the load can be hoisted and eventually
> eliminated by instruction combine, this is not always possible due to
> alignment issue. In particular, the motivating example [1] was a load
> inside a class instance which cannot be hoisted because the `this'
> pointer has an alignment of 1.
>
> [1] http://lists.llvm.org/pipermail/llvm-dev/attachments/20210312/4ce73c47/attachment.cpp
Now, we could enhance LoopRotate to avoid duplicating code when not needed,
but instead hoist loop-invariant code, but isn't that a code duplication? (*sic*)
We have LICM, and in fact we already run it right after LoopRotation.
We could try to move it to before LoopRotation,
that is basically free from compile-time perspective:
https://llvm-compile-time-tracker.com/compare.php?from=6c93eb4477d88af046b915bc955c03693b2cbb58&to=a4bee6d07732b1184c436da489040b912f0dc271&stat=instructions
But, looking at the statistics, I think it isn't great that we would no longer do LICM after LoopRotation, in particular:
| statistic name | LoopRotate-LICM | LICM-LoopRotate | Δ | % | abs(%) |
| asm-printer.EmittedInsts | 9015930 | 9015799 | -131 | 0.00% | 0.00% |
| indvars.NumElimCmp | 3536 | 3544 | 8 | 0.23% | 0.23% |
| indvars.NumElimExt | 36725 | 36580 | -145 | -0.39% | 0.39% |
| indvars.NumElimIV | 1197 | 1187 | -10 | -0.84% | 0.84% |
| indvars.NumElimIdentity | 143 | 136 | -7 | -4.90% | 4.90% |
| indvars.NumElimRem | 4 | 5 | 1 | 25.00% | 25.00% |
| indvars.NumLFTR | 29842 | 29890 | 48 | 0.16% | 0.16% |
| indvars.NumReplaced | 2293 | 2227 | -66 | -2.88% | 2.88% |
| indvars.NumSimplifiedSDiv | 6 | 8 | 2 | 33.33% | 33.33% |
| indvars.NumWidened | 26438 | 26329 | -109 | -0.41% | 0.41% |
| instcount.TotalBlocks | 1178338 | 1173840 | -4498 | -0.38% | 0.38% |
| instcount.TotalFuncs | 111825 | 111829 | 4 | 0.00% | 0.00% |
| instcount.TotalInsts | 9905442 | 9896139 | -9303 | -0.09% | 0.09% |
| lcssa.NumLCSSA | 425871 | 423961 | -1910 | -0.45% | 0.45% |
| licm.NumHoisted | 378357 | 378753 | 396 | 0.10% | 0.10% |
| licm.NumMovedCalls | 2193 | 2208 | 15 | 0.68% | 0.68% |
| licm.NumMovedLoads | 35899 | 31821 | -4078 | -11.36% | 11.36% |
| licm.NumPromoted | 11178 | 11154 | -24 | -0.21% | 0.21% |
| licm.NumSunk | 13359 | 13587 | 228 | 1.71% | 1.71% |
| loop-delete.NumDeleted | 8547 | 8402 | -145 | -1.70% | 1.70% |
| loop-instsimplify.NumSimplified | 12876 | 11890 | -986 | -7.66% | 7.66% |
| loop-peel.NumPeeled | 1008 | 925 | -83 | -8.23% | 8.23% |
| loop-rotate.NumNotRotatedDueToHeaderSize | 368 | 365 | -3 | -0.82% | 0.82% |
| loop-rotate.NumRotated | 42015 | 42003 | -12 | -0.03% | 0.03% |
| loop-simplifycfg.NumLoopBlocksDeleted | 240 | 242 | 2 | 0.83% | 0.83% |
| loop-simplifycfg.NumLoopExitsDeleted | 497 | 20 | -477 | -95.98% | 95.98% |
| loop-simplifycfg.NumTerminatorsFolded | 618 | 336 | -282 | -45.63% | 45.63% |
| loop-unroll.NumCompletelyUnrolled | 11028 | 11032 | 4 | 0.04% | 0.04% |
| loop-unroll.NumUnrolled | 12608 | 12529 | -79 | -0.63% | 0.63% |
| mem2reg.NumDeadAlloca | 10222 | 10221 | -1 | -0.01% | 0.01% |
| mem2reg.NumPHIInsert | 192110 | 192106 | -4 | 0.00% | 0.00% |
| mem2reg.NumSingleStore | 637650 | 637643 | -7 | 0.00% | 0.00% |
| scalar-evolution.NumBruteForceTripCountsComputed | 814 | 812 | -2 | -0.25% | 0.25% |
| scalar-evolution.NumTripCountsComputed | 283108 | 282934 | -174 | -0.06% | 0.06% |
| scalar-evolution.NumTripCountsNotComputed | 106712 | 106718 | 6 | 0.01% | 0.01% |
| simple-loop-unswitch.NumBranches | 5178 | 4752 | -426 | -8.23% | 8.23% |
| simple-loop-unswitch.NumCostMultiplierSkipped | 914 | 503 | -411 | -44.97% | 44.97% |
| simple-loop-unswitch.NumSwitches | 20 | 18 | -2 | -10.00% | 10.00% |
| simple-loop-unswitch.NumTrivial | 183 | 95 | -88 | -48.09% | 48.09% |
... but that actually regresses LICM (-12% `licm.NumMovedLoads`),
loop-simplifycfg (`NumLoopExitsDeleted`, `NumTerminatorsFolded`),
simple-loop-unswitch (`NumTrivial`).
What if we instead have LICM both before and after LoopRotate?
| statistic name | LoopRotate-LICM | LICM-LoopRotate-LICM | Δ | % | abs(%) |
| asm-printer.EmittedInsts | 9015930 | 9014474 | -1456 | -0.02% | 0.02% |
| indvars.NumElimCmp | 3536 | 3546 | 10 | 0.28% | 0.28% |
| indvars.NumElimExt | 36725 | 36681 | -44 | -0.12% | 0.12% |
| indvars.NumElimIV | 1197 | 1185 | -12 | -1.00% | 1.00% |
| indvars.NumElimIdentity | 143 | 146 | 3 | 2.10% | 2.10% |
| indvars.NumElimRem | 4 | 5 | 1 | 25.00% | 25.00% |
| indvars.NumLFTR | 29842 | 29899 | 57 | 0.19% | 0.19% |
| indvars.NumReplaced | 2293 | 2299 | 6 | 0.26% | 0.26% |
| indvars.NumSimplifiedSDiv | 6 | 8 | 2 | 33.33% | 33.33% |
| indvars.NumWidened | 26438 | 26404 | -34 | -0.13% | 0.13% |
| instcount.TotalBlocks | 1178338 | 1173652 | -4686 | -0.40% | 0.40% |
| instcount.TotalFuncs | 111825 | 111829 | 4 | 0.00% | 0.00% |
| instcount.TotalInsts | 9905442 | 9895452 | -9990 | -0.10% | 0.10% |
| lcssa.NumLCSSA | 425871 | 425373 | -498 | -0.12% | 0.12% |
| licm.NumHoisted | 378357 | 383352 | 4995 | 1.32% | 1.32% |
| licm.NumMovedCalls | 2193 | 2204 | 11 | 0.50% | 0.50% |
| licm.NumMovedLoads | 35899 | 35755 | -144 | -0.40% | 0.40% |
| licm.NumPromoted | 11178 | 11163 | -15 | -0.13% | 0.13% |
| licm.NumSunk | 13359 | 14321 | 962 | 7.20% | 7.20% |
| loop-delete.NumDeleted | 8547 | 8538 | -9 | -0.11% | 0.11% |
| loop-instsimplify.NumSimplified | 12876 | 12041 | -835 | -6.48% | 6.48% |
| loop-peel.NumPeeled | 1008 | 924 | -84 | -8.33% | 8.33% |
| loop-rotate.NumNotRotatedDueToHeaderSize | 368 | 365 | -3 | -0.82% | 0.82% |
| loop-rotate.NumRotated | 42015 | 42005 | -10 | -0.02% | 0.02% |
| loop-simplifycfg.NumLoopBlocksDeleted | 240 | 241 | 1 | 0.42% | 0.42% |
| loop-simplifycfg.NumTerminatorsFolded | 618 | 619 | 1 | 0.16% | 0.16% |
| loop-unroll.NumCompletelyUnrolled | 11028 | 11029 | 1 | 0.01% | 0.01% |
| loop-unroll.NumUnrolled | 12608 | 12525 | -83 | -0.66% | 0.66% |
| mem2reg.NumPHIInsert | 192110 | 192073 | -37 | -0.02% | 0.02% |
| mem2reg.NumSingleStore | 637650 | 637652 | 2 | 0.00% | 0.00% |
| scalar-evolution.NumTripCountsComputed | 283108 | 282998 | -110 | -0.04% | 0.04% |
| scalar-evolution.NumTripCountsNotComputed | 106712 | 106691 | -21 | -0.02% | 0.02% |
| simple-loop-unswitch.NumBranches | 5178 | 5185 | 7 | 0.14% | 0.14% |
| simple-loop-unswitch.NumCostMultiplierSkipped | 914 | 925 | 11 | 1.20% | 1.20% |
| simple-loop-unswitch.NumTrivial | 183 | 179 | -4 | -2.19% | 2.19% |
| simple-loop-unswitch.NumBranches | 5178 | 4752 | -426 | -8.23% | 8.23% |
| simple-loop-unswitch.NumCostMultiplierSkipped | 914 | 503 | -411 | -44.97% | 44.97% |
| simple-loop-unswitch.NumSwitches | 20 | 18 | -2 | -10.00% | 10.00% |
| simple-loop-unswitch.NumTrivial | 183 | 95 | -88 | -48.09% | 48.09% |
I.e. we end up with fewer instructions, less peeling, more LICM activity,
also note how none of those 4 regressions are here. Namely:
| statistic name | LICM-LoopRotate | LICM-LoopRotate-LICM | Δ | % | abs(%) |
| asm-printer.EmittedInsts | 9015799 | 9014474 | -1325 | -0.01% | 0.01% |
| indvars.NumElimCmp | 3544 | 3546 | 2 | 0.06% | 0.06% |
| indvars.NumElimExt | 36580 | 36681 | 101 | 0.28% | 0.28% |
| indvars.NumElimIV | 1187 | 1185 | -2 | -0.17% | 0.17% |
| indvars.NumElimIdentity | 136 | 146 | 10 | 7.35% | 7.35% |
| indvars.NumLFTR | 29890 | 29899 | 9 | 0.03% | 0.03% |
| indvars.NumReplaced | 2227 | 2299 | 72 | 3.23% | 3.23% |
| indvars.NumWidened | 26329 | 26404 | 75 | 0.28% | 0.28% |
| instcount.TotalBlocks | 1173840 | 1173652 | -188 | -0.02% | 0.02% |
| instcount.TotalInsts | 9896139 | 9895452 | -687 | -0.01% | 0.01% |
| lcssa.NumLCSSA | 423961 | 425373 | 1412 | 0.33% | 0.33% |
| licm.NumHoisted | 378753 | 383352 | 4599 | 1.21% | 1.21% |
| licm.NumMovedCalls | 2208 | 2204 | -4 | -0.18% | 0.18% |
| licm.NumMovedLoads | 31821 | 35755 | 3934 | 12.36% | 12.36% |
| licm.NumPromoted | 11154 | 11163 | 9 | 0.08% | 0.08% |
| licm.NumSunk | 13587 | 14321 | 734 | 5.40% | 5.40% |
| loop-delete.NumDeleted | 8402 | 8538 | 136 | 1.62% | 1.62% |
| loop-instsimplify.NumSimplified | 11890 | 12041 | 151 | 1.27% | 1.27% |
| loop-peel.NumPeeled | 925 | 924 | -1 | -0.11% | 0.11% |
| loop-rotate.NumRotated | 42003 | 42005 | 2 | 0.00% | 0.00% |
| loop-simplifycfg.NumLoopBlocksDeleted | 242 | 241 | -1 | -0.41% | 0.41% |
| loop-simplifycfg.NumLoopExitsDeleted | 20 | 497 | 477 | 2385.00% | 2385.00% |
| loop-simplifycfg.NumTerminatorsFolded | 336 | 619 | 283 | 84.23% | 84.23% |
| loop-unroll.NumCompletelyUnrolled | 11032 | 11029 | -3 | -0.03% | 0.03% |
| loop-unroll.NumUnrolled | 12529 | 12525 | -4 | -0.03% | 0.03% |
| mem2reg.NumDeadAlloca | 10221 | 10222 | 1 | 0.01% | 0.01% |
| mem2reg.NumPHIInsert | 192106 | 192073 | -33 | -0.02% | 0.02% |
| mem2reg.NumSingleStore | 637643 | 637652 | 9 | 0.00% | 0.00% |
| scalar-evolution.NumBruteForceTripCountsComputed | 812 | 814 | 2 | 0.25% | 0.25% |
| scalar-evolution.NumTripCountsComputed | 282934 | 282998 | 64 | 0.02% | 0.02% |
| scalar-evolution.NumTripCountsNotComputed | 106718 | 106691 | -27 | -0.03% | 0.03% |
| simple-loop-unswitch.NumBranches | 4752 | 5185 | 433 | 9.11% | 9.11% |
| simple-loop-unswitch.NumCostMultiplierSkipped | 503 | 925 | 422 | 83.90% | 83.90% |
| simple-loop-unswitch.NumSwitches | 18 | 20 | 2 | 11.11% | 11.11% |
| simple-loop-unswitch.NumTrivial | 95 | 179 | 84 | 88.42% | 88.42% |
{F15983613} {F15983615} {F15983616}
(this is vanilla llvm testsuite + rawspeed + darktable)
As an example of the code where early LICM only is bad, see:
https://godbolt.org/z/GzEbacs4K
This does have an observable compile-time regression of +~0.5% geomean
https://llvm-compile-time-tracker.com/compare.php?from=7c5222e4d1a3a14f029e5f614c9aefd0fa505f1e&to=5d81826c3411982ca26e46b9d0aff34c80577664&stat=instructions
but I think that's basically nothing, and there's potential that it might
be avoidable in the future by fixing clang to produce alignment information
on function arguments, thus making the second run unneeded.
Differential Revision: https://reviews.llvm.org/D99249
2021-04-02 09:40:12 +02:00
|
|
|
; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
|
|
|
|
; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis
|
|
|
|
; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
|
2021-01-15 22:56:57 +01:00
|
|
|
; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
|
2020-07-29 02:08:24 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: DevirtSCCRepeatedPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: InlinerPass
|
2021-01-15 22:56:57 +01:00
|
|
|
; CHECK-O-NEXT: Running pass: InlinerPass
|
2018-05-08 03:45:46 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
|
2021-04-27 17:56:11 +02:00
|
|
|
; CHECK-O-NEXT: Running analysis: AAManager
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
|
2021-03-24 15:11:32 +01:00
|
|
|
; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
|
|
|
|
; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: SROA
|
|
|
|
; CHECK-O-NEXT: Running pass: EarlyCSEPass
|
2017-06-28 00:25:02 +02:00
|
|
|
; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
|
2019-11-27 05:28:52 +01:00
|
|
|
; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
|
|
|
|
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
|
|
|
|
; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
|
|
|
|
; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
|
2020-07-30 19:14:02 +02:00
|
|
|
; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
|
2018-01-25 13:06:32 +01:00
|
|
|
; CHECK-O3-NEXT: Running pass: AggressiveInstCombinePass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
|
|
|
; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass
|
|
|
|
; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
|
|
|
|
; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass
|
2019-11-27 05:28:52 +01:00
|
|
|
; CHECK-O23SZ-NEXT: Running pass: TailCallElimPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
|
|
|
|
; CHECK-O-NEXT: Running pass: ReassociatePass
|
|
|
|
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
|
2017-12-29 09:16:06 +01:00
|
|
|
; CHECK-O-NEXT: Running pass: LoopSimplifyPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running analysis: LoopAnalysis
|
2017-12-29 09:16:06 +01:00
|
|
|
; CHECK-O-NEXT: Running pass: LCSSAPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis
|
|
|
|
; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
|
2018-05-30 04:46:45 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: LoopInstSimplifyPass
|
|
|
|
; CHECK-O-NEXT: Running pass: LoopSimplifyCFGPass
|
[PassManager] Run additional LICM before LoopRotate
Loop rotation often has to perform code duplication
from header into preheader, which introduces PHI nodes.
>>! In D99204, @thopre wrote:
>
> With loop peeling, it is important that unnecessary PHIs be avoided or
> it will lead to spurious peeling. One source of such PHIs is loop
> rotation which creates PHIs for invariant loads. Those PHIs are
> particularly problematic since loop peeling is now run as part of simple
> loop unrolling before GVN is run, and are thus a source of spurious
> peeling.
>
> Note that while some of the load can be hoisted and eventually
> eliminated by instruction combine, this is not always possible due to
> alignment issue. In particular, the motivating example [1] was a load
> inside a class instance which cannot be hoisted because the `this'
> pointer has an alignment of 1.
>
> [1] http://lists.llvm.org/pipermail/llvm-dev/attachments/20210312/4ce73c47/attachment.cpp
Now, we could enhance LoopRotate to avoid duplicating code when not needed,
but instead hoist loop-invariant code, but isn't that a code duplication? (*sic*)
We have LICM, and in fact we already run it right after LoopRotation.
We could try to move it to before LoopRotation,
that is basically free from compile-time perspective:
https://llvm-compile-time-tracker.com/compare.php?from=6c93eb4477d88af046b915bc955c03693b2cbb58&to=a4bee6d07732b1184c436da489040b912f0dc271&stat=instructions
But, looking at stats, i think it isn't great that we would no longer do LICM after LoopRotation, in particular:
| statistic name | LoopRotate-LICM | LICM-LoopRotate | Δ | % | abs(%) |
| asm-printer.EmittedInsts | 9015930 | 9015799 | -131 | 0.00% | 0.00% |
| indvars.NumElimCmp | 3536 | 3544 | 8 | 0.23% | 0.23% |
| indvars.NumElimExt | 36725 | 36580 | -145 | -0.39% | 0.39% |
| indvars.NumElimIV | 1197 | 1187 | -10 | -0.84% | 0.84% |
| indvars.NumElimIdentity | 143 | 136 | -7 | -4.90% | 4.90% |
| indvars.NumElimRem | 4 | 5 | 1 | 25.00% | 25.00% |
| indvars.NumLFTR | 29842 | 29890 | 48 | 0.16% | 0.16% |
| indvars.NumReplaced | 2293 | 2227 | -66 | -2.88% | 2.88% |
| indvars.NumSimplifiedSDiv | 6 | 8 | 2 | 33.33% | 33.33% |
| indvars.NumWidened | 26438 | 26329 | -109 | -0.41% | 0.41% |
| instcount.TotalBlocks | 1178338 | 1173840 | -4498 | -0.38% | 0.38% |
| instcount.TotalFuncs | 111825 | 111829 | 4 | 0.00% | 0.00% |
| instcount.TotalInsts | 9905442 | 9896139 | -9303 | -0.09% | 0.09% |
| lcssa.NumLCSSA | 425871 | 423961 | -1910 | -0.45% | 0.45% |
| licm.NumHoisted | 378357 | 378753 | 396 | 0.10% | 0.10% |
| licm.NumMovedCalls | 2193 | 2208 | 15 | 0.68% | 0.68% |
| licm.NumMovedLoads | 35899 | 31821 | -4078 | -11.36% | 11.36% |
| licm.NumPromoted | 11178 | 11154 | -24 | -0.21% | 0.21% |
| licm.NumSunk | 13359 | 13587 | 228 | 1.71% | 1.71% |
| loop-delete.NumDeleted | 8547 | 8402 | -145 | -1.70% | 1.70% |
| loop-instsimplify.NumSimplified | 12876 | 11890 | -986 | -7.66% | 7.66% |
| loop-peel.NumPeeled | 1008 | 925 | -83 | -8.23% | 8.23% |
| loop-rotate.NumNotRotatedDueToHeaderSize | 368 | 365 | -3 | -0.82% | 0.82% |
| loop-rotate.NumRotated | 42015 | 42003 | -12 | -0.03% | 0.03% |
| loop-simplifycfg.NumLoopBlocksDeleted | 240 | 242 | 2 | 0.83% | 0.83% |
| loop-simplifycfg.NumLoopExitsDeleted | 497 | 20 | -477 | -95.98% | 95.98% |
| loop-simplifycfg.NumTerminatorsFolded | 618 | 336 | -282 | -45.63% | 45.63% |
| loop-unroll.NumCompletelyUnrolled | 11028 | 11032 | 4 | 0.04% | 0.04% |
| loop-unroll.NumUnrolled | 12608 | 12529 | -79 | -0.63% | 0.63% |
| mem2reg.NumDeadAlloca | 10222 | 10221 | -1 | -0.01% | 0.01% |
| mem2reg.NumPHIInsert | 192110 | 192106 | -4 | 0.00% | 0.00% |
| mem2reg.NumSingleStore | 637650 | 637643 | -7 | 0.00% | 0.00% |
| scalar-evolution.NumBruteForceTripCountsComputed | 814 | 812 | -2 | -0.25% | 0.25% |
| scalar-evolution.NumTripCountsComputed | 283108 | 282934 | -174 | -0.06% | 0.06% |
| scalar-evolution.NumTripCountsNotComputed | 106712 | 106718 | 6 | 0.01% | 0.01% |
| simple-loop-unswitch.NumBranches | 5178 | 4752 | -426 | -8.23% | 8.23% |
| simple-loop-unswitch.NumCostMultiplierSkipped | 914 | 503 | -411 | -44.97% | 44.97% |
| simple-loop-unswitch.NumSwitches | 20 | 18 | -2 | -10.00% | 10.00% |
| simple-loop-unswitch.NumTrivial | 183 | 95 | -88 | -48.09% | 48.09% |
... but that actually regresses LICM (-12% `licm.NumMovedLoads`),
loop-simplifycfg (`NumLoopExitsDeleted`, `NumTerminatorsFolded`),
simple-loop-unswitch (`NumTrivial`).
What if we instead have LICM both before and after LoopRotate?
| statistic name | LoopRotate-LICM | LICM-LoopRotate-LICM | Δ | % | abs(%) |
| asm-printer.EmittedInsts | 9015930 | 9014474 | -1456 | -0.02% | 0.02% |
| indvars.NumElimCmp | 3536 | 3546 | 10 | 0.28% | 0.28% |
| indvars.NumElimExt | 36725 | 36681 | -44 | -0.12% | 0.12% |
| indvars.NumElimIV | 1197 | 1185 | -12 | -1.00% | 1.00% |
| indvars.NumElimIdentity | 143 | 146 | 3 | 2.10% | 2.10% |
| indvars.NumElimRem | 4 | 5 | 1 | 25.00% | 25.00% |
| indvars.NumLFTR | 29842 | 29899 | 57 | 0.19% | 0.19% |
| indvars.NumReplaced | 2293 | 2299 | 6 | 0.26% | 0.26% |
| indvars.NumSimplifiedSDiv | 6 | 8 | 2 | 33.33% | 33.33% |
| indvars.NumWidened | 26438 | 26404 | -34 | -0.13% | 0.13% |
| instcount.TotalBlocks | 1178338 | 1173652 | -4686 | -0.40% | 0.40% |
| instcount.TotalFuncs | 111825 | 111829 | 4 | 0.00% | 0.00% |
| instcount.TotalInsts | 9905442 | 9895452 | -9990 | -0.10% | 0.10% |
| lcssa.NumLCSSA | 425871 | 425373 | -498 | -0.12% | 0.12% |
| licm.NumHoisted | 378357 | 383352 | 4995 | 1.32% | 1.32% |
| licm.NumMovedCalls | 2193 | 2204 | 11 | 0.50% | 0.50% |
| licm.NumMovedLoads | 35899 | 35755 | -144 | -0.40% | 0.40% |
| licm.NumPromoted | 11178 | 11163 | -15 | -0.13% | 0.13% |
| licm.NumSunk | 13359 | 14321 | 962 | 7.20% | 7.20% |
| loop-delete.NumDeleted | 8547 | 8538 | -9 | -0.11% | 0.11% |
| loop-instsimplify.NumSimplified | 12876 | 12041 | -835 | -6.48% | 6.48% |
| loop-peel.NumPeeled | 1008 | 924 | -84 | -8.33% | 8.33% |
| loop-rotate.NumNotRotatedDueToHeaderSize | 368 | 365 | -3 | -0.82% | 0.82% |
| loop-rotate.NumRotated | 42015 | 42005 | -10 | -0.02% | 0.02% |
| loop-simplifycfg.NumLoopBlocksDeleted | 240 | 241 | 1 | 0.42% | 0.42% |
| loop-simplifycfg.NumTerminatorsFolded | 618 | 619 | 1 | 0.16% | 0.16% |
| loop-unroll.NumCompletelyUnrolled | 11028 | 11029 | 1 | 0.01% | 0.01% |
| loop-unroll.NumUnrolled | 12608 | 12525 | -83 | -0.66% | 0.66% |
| mem2reg.NumPHIInsert | 192110 | 192073 | -37 | -0.02% | 0.02% |
| mem2reg.NumSingleStore | 637650 | 637652 | 2 | 0.00% | 0.00% |
| scalar-evolution.NumTripCountsComputed | 283108 | 282998 | -110 | -0.04% | 0.04% |
| scalar-evolution.NumTripCountsNotComputed | 106712 | 106691 | -21 | -0.02% | 0.02% |
| simple-loop-unswitch.NumBranches | 5178 | 5185 | 7 | 0.14% | 0.14% |
| simple-loop-unswitch.NumCostMultiplierSkipped | 914 | 925 | 11 | 1.20% | 1.20% |
| simple-loop-unswitch.NumTrivial | 183 | 179 | -4 | -2.19% | 2.19% |
| simple-loop-unswitch.NumBranches | 5178 | 4752 | -426 | -8.23% | 8.23% |
| simple-loop-unswitch.NumCostMultiplierSkipped | 914 | 503 | -411 | -44.97% | 44.97% |
| simple-loop-unswitch.NumSwitches | 20 | 18 | -2 | -10.00% | 10.00% |
| simple-loop-unswitch.NumTrivial | 183 | 95 | -88 | -48.09% | 48.09% |
I.e. we end up with fewer instructions, less peeling, more LICM activity,
also note how none of those 4 regressions are here. Namely:
| statistic name | LICM-LoopRotate | LICM-LoopRotate-LICM | Δ | % | abs(%) |
| asm-printer.EmittedInsts | 9015799 | 9014474 | -1325 | -0.01% | 0.01% |
| indvars.NumElimCmp | 3544 | 3546 | 2 | 0.06% | 0.06% |
| indvars.NumElimExt | 36580 | 36681 | 101 | 0.28% | 0.28% |
| indvars.NumElimIV | 1187 | 1185 | -2 | -0.17% | 0.17% |
| indvars.NumElimIdentity | 136 | 146 | 10 | 7.35% | 7.35% |
| indvars.NumLFTR | 29890 | 29899 | 9 | 0.03% | 0.03% |
| indvars.NumReplaced | 2227 | 2299 | 72 | 3.23% | 3.23% |
| indvars.NumWidened | 26329 | 26404 | 75 | 0.28% | 0.28% |
| instcount.TotalBlocks | 1173840 | 1173652 | -188 | -0.02% | 0.02% |
| instcount.TotalInsts | 9896139 | 9895452 | -687 | -0.01% | 0.01% |
| lcssa.NumLCSSA | 423961 | 425373 | 1412 | 0.33% | 0.33% |
| licm.NumHoisted | 378753 | 383352 | 4599 | 1.21% | 1.21% |
| licm.NumMovedCalls | 2208 | 2204 | -4 | -0.18% | 0.18% |
| licm.NumMovedLoads | 31821 | 35755 | 3934 | 12.36% | 12.36% |
| licm.NumPromoted | 11154 | 11163 | 9 | 0.08% | 0.08% |
| licm.NumSunk | 13587 | 14321 | 734 | 5.40% | 5.40% |
| loop-delete.NumDeleted | 8402 | 8538 | 136 | 1.62% | 1.62% |
| loop-instsimplify.NumSimplified | 11890 | 12041 | 151 | 1.27% | 1.27% |
| loop-peel.NumPeeled | 925 | 924 | -1 | -0.11% | 0.11% |
| loop-rotate.NumRotated | 42003 | 42005 | 2 | 0.00% | 0.00% |
| loop-simplifycfg.NumLoopBlocksDeleted | 242 | 241 | -1 | -0.41% | 0.41% |
| loop-simplifycfg.NumLoopExitsDeleted | 20 | 497 | 477 | 2385.00% | 2385.00% |
| loop-simplifycfg.NumTerminatorsFolded | 336 | 619 | 283 | 84.23% | 84.23% |
| loop-unroll.NumCompletelyUnrolled | 11032 | 11029 | -3 | -0.03% | 0.03% |
| loop-unroll.NumUnrolled | 12529 | 12525 | -4 | -0.03% | 0.03% |
| mem2reg.NumDeadAlloca | 10221 | 10222 | 1 | 0.01% | 0.01% |
| mem2reg.NumPHIInsert | 192106 | 192073 | -33 | -0.02% | 0.02% |
| mem2reg.NumSingleStore | 637643 | 637652 | 9 | 0.00% | 0.00% |
| scalar-evolution.NumBruteForceTripCountsComputed | 812 | 814 | 2 | 0.25% | 0.25% |
| scalar-evolution.NumTripCountsComputed | 282934 | 282998 | 64 | 0.02% | 0.02% |
| scalar-evolution.NumTripCountsNotComputed | 106718 | 106691 | -27 | -0.03% | 0.03% |
| simple-loop-unswitch.NumBranches | 4752 | 5185 | 433 | 9.11% | 9.11% |
| simple-loop-unswitch.NumCostMultiplierSkipped | 503 | 925 | 422 | 83.90% | 83.90% |
| simple-loop-unswitch.NumSwitches | 18 | 20 | 2 | 11.11% | 11.11% |
| simple-loop-unswitch.NumTrivial | 95 | 179 | 84 | 88.42% | 88.42% |
{F15983613} {F15983615} {F15983616}
(this is vanilla llvm testsuite + rawspeed + darktable)
As an example of the code where early LICM only is bad, see:
https://godbolt.org/z/GzEbacs4K
This does have an observable compile-time regression of +~0.5% geomean
https://llvm-compile-time-tracker.com/compare.php?from=7c5222e4d1a3a14f029e5f614c9aefd0fa505f1e&to=5d81826c3411982ca26e46b9d0aff34c80577664&stat=instructions
but i think that's basically nothing, and there's potential that it might
be avoidable in the future by fixing clang to produce alignment information
on function arguments, thus making the second run unneeded.
Differential Revision: https://reviews.llvm.org/D99249
2021-04-02 09:40:12 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: LICM
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: LoopRotatePass
|
|
|
|
; CHECK-O-NEXT: Running pass: LICM
|
|
|
|
; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
|
|
|
|
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
|
|
|
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
2017-12-29 09:16:06 +01:00
|
|
|
; CHECK-O-NEXT: Running pass: LoopSimplifyPass
|
|
|
|
; CHECK-O-NEXT: Running pass: LCSSAPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: LoopIdiomRecognizePass
|
[PassManager] Run Induction Variable Simplification pass *after* Recognize loop idioms pass, not before
Currently, `-indvars` runs first, and then immediately after `-loop-idiom` does.
I'm not really sure if `-loop-idiom` requires `-indvars` to run beforehand,
but i'm *very* sure that `-indvars` requires `-loop-idiom` to run afterwards,
as it can be seen in the phase-ordering test.
LoopIdiom runs on two types of loops: countable ones, and uncountable ones.
For uncountable ones, IndVars obviously didn't make any change to them,
since they are uncountable, so for them the order should be irrelevant.
For countable ones, well, they should have been countable before IndVars
for IndVars to make any change to them, and since SCEV is used on them,
it shouldn't matter if IndVars have already canonicalized them.
So i don't really see why we'd want the current ordering.
Should this cause issues, it will give us a reproducer test case
that shows flaws in this logic, and we then could adjust accordingly.
While this is quite likely beneficial in-the-wild already,
it's a required part for the full motivational pattern
behind `left-shift-until-bittest` loop idiom (D91038).
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D91800
2020-11-25 17:17:25 +01:00
|
|
|
; CHECK-O-NEXT: Running pass: IndVarSimplifyPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: LoopDeletionPass
|
2017-08-02 22:35:29 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: LoopFullUnrollPass
|
2019-11-01 22:59:08 +01:00
|
|
|
; CHECK-O-NEXT: Running pass: SROA on foo
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
|
|
|
|
; CHECK-Os-NEXT: Running pass: GVN
|
2018-10-01 20:57:08 +02:00
|
|
|
; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis
|
|
|
|
; CHECK-Os-NEXT: Running analysis: PhiValuesAnalysis
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-Oz-NEXT: Running pass: MergedLoadStoreMotionPass
|
|
|
|
; CHECK-Oz-NEXT: Running pass: GVN
|
2018-10-01 20:57:08 +02:00
|
|
|
; CHECK-Oz-NEXT: Running analysis: MemoryDependenceAnalysis
|
|
|
|
; CHECK-Oz-NEXT: Running analysis: PhiValuesAnalysis
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O2-NEXT: Running pass: MergedLoadStoreMotionPass
|
|
|
|
; CHECK-O2-NEXT: Running pass: GVN
|
2018-10-01 20:57:08 +02:00
|
|
|
; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis
|
|
|
|
; CHECK-O2-NEXT: Running analysis: PhiValuesAnalysis
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O3-NEXT: Running pass: MergedLoadStoreMotionPass
|
|
|
|
; CHECK-O3-NEXT: Running pass: GVN
|
2018-10-01 20:57:08 +02:00
|
|
|
; CHECK-O3-NEXT: Running analysis: MemoryDependenceAnalysis
|
|
|
|
; CHECK-O3-NEXT: Running analysis: PhiValuesAnalysis
|
2021-01-10 10:52:01 +01:00
|
|
|
; CHECK-O1-NEXT: Running pass: MemCpyOptPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: SCCPPass
|
|
|
|
; CHECK-O-NEXT: Running pass: BDCEPass
|
|
|
|
; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
|
|
|
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
2019-11-27 05:28:52 +01:00
|
|
|
; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
|
2020-07-30 19:14:02 +02:00
|
|
|
; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
|
2019-11-27 05:28:52 +01:00
|
|
|
; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
|
2020-07-30 19:14:02 +02:00
|
|
|
; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
|
2021-07-15 08:31:31 +02:00
|
|
|
; CHECK-O1-NEXT: Running pass: CoroElidePass
|
2020-10-21 11:21:50 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: ADCEPass
|
|
|
|
; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
|
2021-01-10 10:52:01 +01:00
|
|
|
; CHECK-O23SZ-NEXT: Running pass: MemCpyOptPass
|
2019-11-27 05:28:52 +01:00
|
|
|
; CHECK-O23SZ-NEXT: Running pass: DSEPass
|
|
|
|
; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
|
|
|
|
; CHECK-O23SZ-NEXT: Running pass: LCSSAPass
|
2020-07-29 02:08:24 +02:00
|
|
|
; CHECK-O23SZ-NEXT: Running pass: LICMPass on Loop at depth 1 containing: %loop
|
2021-07-15 08:31:31 +02:00
|
|
|
; CHECK-O23SZ-NEXT: Running pass: CoroElidePass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
|
|
|
|
; CHECK-O-NEXT: Running pass: InstCombinePass
|
2021-07-15 08:31:31 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: CoroSplitPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-PRELINK-O-NEXT: Running pass: GlobalOptPass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: GlobalOptPass
|
2017-10-05 20:36:01 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: GlobalDCEPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: EliminateAvailableExternallyPass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: Float2IntPass
|
2019-10-14 18:15:14 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LowerConstantIntrinsicsPass
|
2019-06-08 17:37:47 +02:00
|
|
|
; CHECK-EXT: Running pass: {{.*}}::Bye
|
2017-12-29 09:16:06 +01:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopSimplifyPass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LCSSAPass
|
2020-07-29 02:08:24 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopRotatePass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopDistributePass
|
2020-04-06 20:16:48 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
|
2020-01-10 05:58:31 +01:00
|
|
|
; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
|
|
|
|
; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
|
|
|
|
; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
|
2020-05-22 18:13:18 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
|
2017-08-02 22:35:29 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
|
[Unroll/UnrollAndJam/Vectorizer/Distribute] Add followup loop attributes.
When multiple loop transformations are defined in a loop's metadata, their order of execution is defined by the order of their respective passes in the pass pipeline. For instance,
#pragma clang loop unroll_and_jam(enable)
#pragma clang loop distribute(enable)
is the same as
#pragma clang loop distribute(enable)
#pragma clang loop unroll_and_jam(enable)
and will try to loop-distribute before Unroll-And-Jam because the LoopDistribute pass is scheduled after UnrollAndJam pass. UnrollAndJamPass only supports one inner loop, i.e. it will necessarily fail after loop distribution. It is not possible to specify another execution order. Also, the order of passes in the pipeline is subject to change between versions of LLVM, optimization options and which pass manager is used.
This patch adds 'followup' attributes to various loop transformation passes. These attributes define which attributes the resulting loop of a transformation should have. For instance,
!0 = !{!0, !1, !2}
!1 = !{!"llvm.loop.unroll_and_jam.enable"}
!2 = !{!"llvm.loop.unroll_and_jam.followup_inner", !3}
!3 = !{!"llvm.loop.distribute.enable"}
defines a loop ID (!0) to be unrolled-and-jammed (!1) and then the attribute !3 to be added to the jammed inner loop, which contains the instruction to distribute the inner loop.
Currently, in both pass managers, pass execution is in a fixed order and UnrollAndJamPass will not execute again after LoopDistribute. We hope to fix this in the future by allowing pass managers to run passes until a fixpoint is reached, use Polly to perform these transformations, or add a loop transformation pass which takes the order issue into account.
For mandatory/forced transformations (e.g. by having been declared by #pragma omp simd), the user must be notified when a transformation could not be performed. It is not possible that the responsible pass emits such a warning because the transformation might be 'hidden' in a followup attribute when it is executed, or it is not present in the pipeline at all. For this reason, this patch introduces a WarnMissedTransformations pass, to warn about orphaned transformations.
Since this changes the user-visible diagnostic message when a transformation is applied, two test cases in the clang repository need to be updated.
To ensure that no other transformation is executed before the intended one, the attribute `llvm.loop.disable_nonforced` can be added which should disable transformation heuristics before the intended transformation is applied. E.g. it would be surprising if a loop is distributed before a #pragma unroll_and_jam is applied.
With more supported code transformations (loop fusion, interchange, stripmining, offloading, etc.), transformations can be used as building blocks for more complex transformations (e.g. stripmining+stripmining+interchange -> tiling).
Reviewed By: hfinkel, dmgreen
Differential Revision: https://reviews.llvm.org/D49281
Differential Revision: https://reviews.llvm.org/D55288
llvm-svn: 348944
2018-12-12 18:32:52 +01:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
|
2017-12-29 09:16:06 +01:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopSimplifyPass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LCSSAPass
|
2020-07-29 02:08:24 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LICMPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass
|
2018-06-30 01:36:03 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifyPass
|
2017-09-09 15:38:18 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: DivRemPairsPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
|
2021-07-15 08:31:31 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: CoroCleanupPass
|
2018-07-16 02:28:24 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: CGProfilePass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: GlobalDCEPass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: ConstantMergePass
|
2020-12-29 22:32:13 +01:00
|
|
|
; CHECK-POSTLINK-O-NEXT: Running pass: RelLookupTableConverterPass
|
|
|
|
; CHECK-POSTLINK-O-NEXT: Running analysis: TargetIRAnalysis
|
2020-11-13 10:46:55 +01:00
|
|
|
; CHECK-O-NEXT: Running pass: AnnotationRemarksPass on foo
|
2020-11-16 21:48:42 +01:00
|
|
|
; CHECK-PRELINK-O-NEXT: Running pass: CanonicalizeAliasesPass
|
2017-06-30 01:08:38 +02:00
|
|
|
; CHECK-PRELINK-O-NEXT: Running pass: NameAnonGlobalPass
|
2017-06-01 13:39:39 +02:00
|
|
|
; CHECK-O-NEXT: Running pass: PrintModulePass
|
|
|
|
|
|
|
|
; Make sure we get the IR back out without changes when we print the module.
|
|
|
|
; CHECK-O-LABEL: define void @foo(i32 %n) local_unnamed_addr {
|
|
|
|
; CHECK-O-NEXT: entry:
|
|
|
|
; CHECK-O-NEXT: br label %loop
|
|
|
|
; CHECK-O: loop:
|
|
|
|
; CHECK-O-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
|
|
|
|
; CHECK-O-NEXT: %iv.next = add i32 %iv, 1
|
|
|
|
; CHECK-O-NEXT: tail call void @bar()
|
|
|
|
; CHECK-O-NEXT: %cmp = icmp eq i32 %iv, %n
|
|
|
|
; CHECK-O-NEXT: br i1 %cmp, label %exit, label %loop
|
|
|
|
; CHECK-O: exit:
|
|
|
|
; CHECK-O-NEXT: ret void
|
|
|
|
; CHECK-O-NEXT: }
|
|
|
|
;
|
|
|
|
|
|
|
|
declare void @bar() local_unnamed_addr
|
|
|
|
|
|
|
|
; Single-loop function: %iv starts at 0 and is incremented by 1 each
; iteration, @bar is called once per iteration, and the loop exits to %exit
; when %iv equals %n.
; The expected-output directives earlier in this file mirror this body line by
; line, so instruction order and value names must stay exactly as written.
define void @foo(i32 %n) local_unnamed_addr {
|
|
|
|
entry:
|
|
|
|
br label %loop
|
|
|
|
loop:
|
|
|
|
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
|
|
|
|
%iv.next = add i32 %iv, 1
|
|
|
|
tail call void @bar()
|
|
|
|
%cmp = icmp eq i32 %iv, %n
|
|
|
|
br i1 %cmp, label %exit, label %loop
|
|
|
|
exit:
|
|
|
|
ret void
|
|
|
|
}
|