mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[LV] Optimize for size when vectorizing loops with tiny trip count
It may be detrimental to vectorize loops with a very small trip count, as various costs of the vectorized loop body, as well as enclosing overheads including runtime tests and scalar iterations, may outweigh the gains of vectorizing. The current cost model measures the cost of the vectorized loop body only, expecting it will amortize other costs, and loops with known or expected very small trip counts are not vectorized at all. This patch allows loops with very small trip counts to be vectorized, but under OptForSize constraints, which ensure the cost of the loop body is dominant, with neither runtime guards nor scalar iterations. Patch inspired by D32451. Differential Revision: https://reviews.llvm.org/D34373 llvm-svn: 306803
This commit is contained in:
parent
74d6ad5a45
commit
fbea2e7ae6
@ -114,12 +114,13 @@ static cl::opt<bool>
|
|||||||
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
|
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
|
||||||
cl::desc("Enable if-conversion during vectorization."));
|
cl::desc("Enable if-conversion during vectorization."));
|
||||||
|
|
||||||
/// We don't vectorize loops with a known constant trip count below this number.
|
/// Loops with a known constant trip count below this number are vectorized only
|
||||||
|
/// if no scalar iteration overheads are incurred.
|
||||||
static cl::opt<unsigned> TinyTripCountVectorThreshold(
|
static cl::opt<unsigned> TinyTripCountVectorThreshold(
|
||||||
"vectorizer-min-trip-count", cl::init(16), cl::Hidden,
|
"vectorizer-min-trip-count", cl::init(16), cl::Hidden,
|
||||||
cl::desc("Don't vectorize loops with a constant "
|
cl::desc("Loops with a constant trip count that is smaller than this "
|
||||||
"trip count that is smaller than this "
|
"value are vectorized only if no scalar iteration overheads "
|
||||||
"value."));
|
"are incurred."));
|
||||||
|
|
||||||
static cl::opt<bool> MaximizeBandwidth(
|
static cl::opt<bool> MaximizeBandwidth(
|
||||||
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
|
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
|
||||||
@ -7801,34 +7802,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check the loop for a trip count threshold:
|
|
||||||
// do not vectorize loops with a tiny trip count.
|
|
||||||
unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
|
|
||||||
bool HasExpectedTC = (ExpectedTC > 0);
|
|
||||||
|
|
||||||
if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
|
|
||||||
auto EstimatedTC = getLoopEstimatedTripCount(L);
|
|
||||||
if (EstimatedTC) {
|
|
||||||
ExpectedTC = *EstimatedTC;
|
|
||||||
HasExpectedTC = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
|
|
||||||
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
|
|
||||||
<< "This loop is not worth vectorizing.");
|
|
||||||
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
|
|
||||||
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
|
|
||||||
else {
|
|
||||||
DEBUG(dbgs() << "\n");
|
|
||||||
ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
|
|
||||||
"NotBeneficial", L)
|
|
||||||
<< "vectorization is not beneficial "
|
|
||||||
"and is not explicitly forced");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
PredicatedScalarEvolution PSE(*SE, *L);
|
PredicatedScalarEvolution PSE(*SE, *L);
|
||||||
|
|
||||||
// Check if it is legal to vectorize the loop.
|
// Check if it is legal to vectorize the loop.
|
||||||
@ -7846,6 +7819,34 @@ bool LoopVectorizePass::processLoop(Loop *L) {
|
|||||||
bool OptForSize =
|
bool OptForSize =
|
||||||
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
|
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
|
||||||
|
|
||||||
|
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
|
||||||
|
// count by optimizing for size, to minimize overheads.
|
||||||
|
unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
|
||||||
|
bool HasExpectedTC = (ExpectedTC > 0);
|
||||||
|
|
||||||
|
if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
|
||||||
|
auto EstimatedTC = getLoopEstimatedTripCount(L);
|
||||||
|
if (EstimatedTC) {
|
||||||
|
ExpectedTC = *EstimatedTC;
|
||||||
|
HasExpectedTC = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
|
||||||
|
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
|
||||||
|
<< "This loop is worth vectorizing only if no scalar "
|
||||||
|
<< "iteration overheads are incurred.");
|
||||||
|
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
|
||||||
|
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
|
||||||
|
else {
|
||||||
|
DEBUG(dbgs() << "\n");
|
||||||
|
// Loops with a very small trip count are considered for vectorization
|
||||||
|
// under OptForSize, thereby making sure the cost of their loop body is
|
||||||
|
// dominant, free of runtime guards and scalar iteration overheads.
|
||||||
|
OptForSize = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Check the function attributes to see if implicit floats are allowed.
|
// Check the function attributes to see if implicit floats are allowed.
|
||||||
// FIXME: This check doesn't seem possibly correct -- what if the loop is
|
// FIXME: This check doesn't seem possibly correct -- what if the loop is
|
||||||
// an integer loop and the vector instructions selected are purely integer
|
// an integer loop and the vector instructions selected are purely integer
|
||||||
|
@ -3,10 +3,11 @@
|
|||||||
|
|
||||||
; CHECK: LV: Loop hints: force=enabled
|
; CHECK: LV: Loop hints: force=enabled
|
||||||
; CHECK: LV: Loop hints: force=?
|
; CHECK: LV: Loop hints: force=?
|
||||||
|
; CHECK: LV: Loop hints: force=?
|
||||||
; No more loops in the module
|
; No more loops in the module
|
||||||
; CHECK-NOT: LV: Loop hints: force=
|
; CHECK-NOT: LV: Loop hints: force=
|
||||||
; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization
|
; CHECK: 3 loop-vectorize - Number of loops analyzed for vectorization
|
||||||
; CHECK: 1 loop-vectorize - Number of loops vectorized
|
; CHECK: 2 loop-vectorize - Number of loops vectorized
|
||||||
|
|
||||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||||
target triple = "x86_64-apple-macosx10.8.0"
|
target triple = "x86_64-apple-macosx10.8.0"
|
||||||
@ -71,3 +72,29 @@ for.end:
|
|||||||
|
|
||||||
!3 = !{!3}
|
!3 = !{!3}
|
||||||
|
|
||||||
|
;
|
||||||
|
; This loop will be vectorized as the trip count is below the threshold but no
|
||||||
|
; scalar iterations are needed.
|
||||||
|
;
|
||||||
|
define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
|
||||||
|
entry:
|
||||||
|
br label %for.body
|
||||||
|
|
||||||
|
for.body:
|
||||||
|
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||||
|
%arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
|
||||||
|
%0 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
|
||||||
|
%arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
|
||||||
|
%1 = load float, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
|
||||||
|
%add = fadd fast float %0, %1
|
||||||
|
store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
|
||||||
|
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||||
|
%exitcond = icmp eq i64 %indvars.iv.next, 16
|
||||||
|
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
|
||||||
|
|
||||||
|
for.end:
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
!4 = !{!4}
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
|
|||||||
@c = common global [2048 x i32] zeroinitializer, align 16
|
@c = common global [2048 x i32] zeroinitializer, align 16
|
||||||
|
|
||||||
;CHECK-LABEL: @example1(
|
;CHECK-LABEL: @example1(
|
||||||
;CHECK-NOT: load <4 x i32>
|
;CHECK: load <4 x i32>
|
||||||
;CHECK: ret void
|
;CHECK: ret void
|
||||||
define void @example1() nounwind uwtable ssp {
|
define void @example1() nounwind uwtable ssp {
|
||||||
br label %1
|
br label %1
|
||||||
@ -23,8 +23,8 @@ define void @example1() nounwind uwtable ssp {
|
|||||||
store i32 %6, i32* %7, align 4
|
store i32 %6, i32* %7, align 4
|
||||||
%indvars.iv.next = add i64 %indvars.iv, 1
|
%indvars.iv.next = add i64 %indvars.iv, 1
|
||||||
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
|
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
|
||||||
%exitcond = icmp eq i32 %lftr.wideiv, 8 ; <----- A really small trip count.
|
%exitcond = icmp eq i32 %lftr.wideiv, 8 ; <----- A really small trip count
|
||||||
br i1 %exitcond, label %8, label %1
|
br i1 %exitcond, label %8, label %1 ; w/o scalar iteration overhead.
|
||||||
|
|
||||||
; <label>:8 ; preds = %1
|
; <label>:8 ; preds = %1
|
||||||
ret void
|
ret void
|
||||||
|
Loading…
x
Reference in New Issue
Block a user