
[PassManager] adjust VectorCombine placement

The initial placement of vector-combine in the opt pipeline revealed phase ordering bugs:
https://bugs.llvm.org/show_bug.cgi?id=45015
https://bugs.llvm.org/show_bug.cgi?id=42022

This patch contains a few independent changes:

1. Move the pass up in the pipeline, so it happens just after loop-vectorization.
   This is only to keep vectorization passes together in the pipeline at the moment.
   I don't have evidence of interaction between these yet.
2. Add an -early-cse pass after -vector-combine to clean up redundant ops. This was
   partly proposed as far back as rL219644 (which is why it's effectively being moved
   in the old PM code). This is important because the subsequent -instcombine doesn't
   work as well without EarlyCSE. With the CSE, -instcombine is able to squash
   shuffles together in one of the tests (because those are simple "select" shuffles);
   see the IR sketch after this list.
3. Remove the -vector-combine pass that was running after SLP. We may want to do that
   eventually, but I don't have a test case to support it yet.
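
Below is a minimal IR sketch (my illustration, not a test from this patch) of the
redundancy that change #2 targets: two identical fadds feeding shuffles, the shape
that vectorization passes tend to leave behind:

    define <4 x float> @cse_then_combine(<4 x float> %a, <4 x float> %b) {
      %add0 = fadd <4 x float> %a, %b
      %add1 = fadd <4 x float> %a, %b   ; redundant; -early-cse folds this into %add0
      %half = shufflevector <4 x float> %add0, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
      %full = shufflevector <4 x float> %half, <4 x float> %add1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
      ret <4 x float> %full
    }

With the CSE applied, %full becomes a "select" shuffle of two shuffles of the same
value, which -instcombine can collapse to plain %add0; without it, %add0 and %add1
remain distinct values and the shuffles survive.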

Differential Revision: https://reviews.llvm.org/D75145
Sanjay Patel, 2020-03-04 11:08:28 -05:00
commit 8f04b72eb3 (parent 611aa86281)
10 changed files with 33 additions and 54 deletions
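
For reference, the pipeline listings checked by the test diffs below come from opt's
pass-debugging output. A rough recipe (a sketch; %s stands for any input .ll file,
and the exact RUN lines in the tests differ slightly):

    ; New PM: emits the "Running pass: ..." lines.
    ; RUN: opt -debug-pass-manager -passes='default<O2>' -S %s 2>&1 | FileCheck %s
    ; Legacy PM: emits the indented pass-structure listing.
    ; RUN: opt -O2 -debug-pass=Structure -S %s 2>&1 | FileCheck %s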


@@ -966,12 +966,15 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
   OptimizePM.addPass(LoopVectorizePass(
       LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
+  // Enhance/cleanup vector code.
+  OptimizePM.addPass(VectorCombinePass());
+  OptimizePM.addPass(EarlyCSEPass());
   // Eliminate loads by forwarding stores from the previous iteration to loads
   // of the current iteration.
   OptimizePM.addPass(LoopLoadEliminationPass());
   // Cleanup after the loop optimization passes.
-  OptimizePM.addPass(VectorCombinePass());
   OptimizePM.addPass(InstCombinePass());
   // Now that we've formed fast to execute loop structures, we do further
@@ -990,10 +993,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
                                                  sinkCommonInsts(true)));
   // Optimize parallel scalar instruction chains into SIMD instructions.
-  if (PTO.SLPVectorization) {
+  if (PTO.SLPVectorization)
     OptimizePM.addPass(SLPVectorizerPass());
-    OptimizePM.addPass(VectorCombinePass());
-  }
   OptimizePM.addPass(InstCombinePass());


@@ -729,6 +729,8 @@ void PassManagerBuilder::populateModulePassManager(
   MPM.add(createLoopDistributePass());
   MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+  MPM.add(createVectorCombinePass());
+  MPM.add(createEarlyCSEPass());
   // Eliminate loads by forwarding stores from the previous iteration to loads
   // of the current iteration.
@@ -739,7 +741,6 @@ void PassManagerBuilder::populateModulePassManager(
   // on -O1 and no #pragma is found). Would be good to have these two passes
   // as function calls, so that we can only pass them when the vectorizer
   // changed the code.
-  MPM.add(createVectorCombinePass());
   addInstructionCombiningPass(MPM);
   if (OptLevel > 1 && ExtraVectorizerPasses) {
     // At higher optimization levels, try to clean up any runtime overlap and
@@ -748,7 +749,6 @@ void PassManagerBuilder::populateModulePassManager(
     // common computations, hoist loop-invariant aspects out of any outer loop,
     // and unswitch the runtime checks if possible. Once hoisted, we may have
     // dead (or speculatable) control flows or more combining opportunities.
-    MPM.add(createEarlyCSEPass());
     MPM.add(createCorrelatedValuePropagationPass());
     addInstructionCombiningPass(MPM);
     MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
@@ -766,7 +766,6 @@ void PassManagerBuilder::populateModulePassManager(
   if (SLPVectorize) {
     MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-    MPM.add(createVectorCombinePass());
     if (OptLevel > 1 && ExtraVectorizerPasses) {
       MPM.add(createEarlyCSEPass());
     }
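
For orientation (my note, not part of the patch): "Optimize scalar/vector ops" in the
legacy pass listings further below is VectorCombine's legacy pass name. A typical
pattern the pass targets is a scalar op on two extracted lanes:

    define float @ext_ext_fadd(<4 x float> %x) {
      %e0 = extractelement <4 x float> %x, i32 0
      %e1 = extractelement <4 x float> %x, i32 1
      %r = fadd float %e0, %e1   ; scalar op on two lanes of one vector
      ret float %r
    }

When the target's cost model favors it, VectorCombine rewrites this into a vector
fadd of %x and a shuffle of %x followed by a single extractelement, which is why a
cleanup pass right after it (EarlyCSE, then InstCombine) pays off.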


@@ -250,17 +250,15 @@
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
+; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
-; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
-; CHECK-O2-NEXT: Running pass: VectorCombinePass
-; CHECK-O3-NEXT: Running pass: VectorCombinePass
-; CHECK-Os-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass


@@ -220,17 +220,15 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
+; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-POSTLINK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis
-; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-POSTLINK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass
-; CHECK-POSTLINK-O2-NEXT: Running pass: VectorCombinePass
-; CHECK-POSTLINK-O3-NEXT: Running pass: VectorCombinePass
-; CHECK-POSTLINK-Os-NEXT: Running pass: VectorCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass


@@ -188,17 +188,15 @@
 ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
-; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
-; CHECK-O2-NEXT: Running pass: VectorCombinePass
-; CHECK-O3-NEXT: Running pass: VectorCombinePass
-; CHECK-Os-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass


@@ -199,17 +199,15 @@
 ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run
 ; CHECK-O-NEXT: Running pass: LoopDistributePass
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: VectorCombinePass
+; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
-; CHECK-O-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass
 ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass
-; CHECK-O2-NEXT: Running pass: VectorCombinePass
-; CHECK-O3-NEXT: Running pass: VectorCombinePass
-; CHECK-Os-NEXT: Running pass: VectorCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass


@@ -225,6 +225,8 @@
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: Inject TLI Mappings
 ; CHECK-NEXT: Loop Vectorization
+; CHECK-NEXT: Optimize scalar/vector ops
+; CHECK-NEXT: Early CSE
 ; CHECK-NEXT: Canonicalize natural loops
 ; CHECK-NEXT: Scalar Evolution Analysis
 ; CHECK-NEXT: Function Alias Analysis Results
@@ -232,7 +234,6 @@
 ; CHECK-NEXT: Lazy Branch Probability Analysis
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Loop Load Elimination
-; CHECK-NEXT: Optimize scalar/vector ops
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Lazy Branch Probability Analysis
@@ -250,8 +251,6 @@
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: SLP Vectorizer
-; CHECK-NEXT: Optimize scalar/vector ops
-; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: Combine redundant instructions
 ; CHECK-NEXT: Canonicalize natural loops


@@ -230,6 +230,8 @@
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: Inject TLI Mappings
 ; CHECK-NEXT: Loop Vectorization
+; CHECK-NEXT: Optimize scalar/vector ops
+; CHECK-NEXT: Early CSE
 ; CHECK-NEXT: Canonicalize natural loops
 ; CHECK-NEXT: Scalar Evolution Analysis
 ; CHECK-NEXT: Function Alias Analysis Results
@@ -237,7 +239,6 @@
 ; CHECK-NEXT: Lazy Branch Probability Analysis
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Loop Load Elimination
-; CHECK-NEXT: Optimize scalar/vector ops
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Lazy Branch Probability Analysis
@@ -255,8 +256,6 @@
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: SLP Vectorizer
-; CHECK-NEXT: Optimize scalar/vector ops
-; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: Combine redundant instructions
 ; CHECK-NEXT: Canonicalize natural loops


@@ -212,6 +212,8 @@
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: Inject TLI Mappings
 ; CHECK-NEXT: Loop Vectorization
+; CHECK-NEXT: Optimize scalar/vector ops
+; CHECK-NEXT: Early CSE
 ; CHECK-NEXT: Canonicalize natural loops
 ; CHECK-NEXT: Scalar Evolution Analysis
 ; CHECK-NEXT: Function Alias Analysis Results
@@ -219,7 +221,6 @@
 ; CHECK-NEXT: Lazy Branch Probability Analysis
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Loop Load Elimination
-; CHECK-NEXT: Optimize scalar/vector ops
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Lazy Branch Probability Analysis
@@ -237,8 +238,6 @@
 ; CHECK-NEXT: Lazy Block Frequency Analysis
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: SLP Vectorizer
-; CHECK-NEXT: Optimize scalar/vector ops
-; CHECK-NEXT: Function Alias Analysis Results
 ; CHECK-NEXT: Optimization Remark Emitter
 ; CHECK-NEXT: Combine redundant instructions
 ; CHECK-NEXT: Canonicalize natural loops


@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-; TODO: Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
+; Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
 ; That may require some coordination between VectorCombine, SLP, and other passes.
 ; The end goal is to get a single "vaddsubps" instruction for x86 with AVX.
@@ -12,11 +12,7 @@ define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
 ; CHECK-LABEL: @PR45015(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]]
-; CHECK-NEXT:    [[T8:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[ARG]], [[ARG1]]
-; CHECK-NEXT:    [[T12:%.*]] = shufflevector <4 x float> [[T8]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]]
-; CHECK-NEXT:    [[T16:%.*]] = shufflevector <4 x float> [[T12]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[T16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 ; CHECK-NEXT:    ret <4 x float> [[T16]]
 ;
   %t = extractelement <4 x float> %arg, i32 0
@@ -45,13 +41,9 @@ define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
 define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1) {
 ; CHECK-LABEL: @add_aggregate(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[A0]], [[B0]]
-; CHECK-NEXT:    [[RETVAL_0_1_INSERT:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[A1]], [[B1]]
-; CHECK-NEXT:    [[RETVAL_1_1_INSERT:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[RETVAL_0_1_INSERT]], 0
-; CHECK-NEXT:    [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[RETVAL_1_1_INSERT]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
+; CHECK-NEXT:    [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[TMP1]], 0
+; CHECK-NEXT:    [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[TMP2]], 1
 ; CHECK-NEXT:    ret { <2 x float>, <2 x float> } [[FCA_1_INSERT]]
 ;
   %a00 = extractelement <2 x float> %a0, i32 0
@@ -81,18 +73,16 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
 ; CHECK-NEXT:    [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
 ; CHECK-NEXT:    store float [[TMP2]], float* [[R0]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[A0]], [[B0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
-; CHECK-NEXT:    store float [[TMP4]], float* [[R1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; CHECK-NEXT:    store float [[TMP3]], float* [[R1]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
-; CHECK-NEXT:    store float [[TMP6]], float* [[R2]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[A1]], [[B1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
+; CHECK-NEXT:    store float [[TMP5]], float* [[R2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
-; CHECK-NEXT:    store float [[TMP8]], float* [[R3]], align 4
+; CHECK-NEXT:    store float [[TMP6]], float* [[R3]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %a00 = extractelement <2 x float> %a0, i32 0