1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 12:12:47 +01:00

[LoopVectorize] Don't use strict reductions when reordering is allowed

If the `-enable-strict-reductions` flag is set to true, then currently we will
always choose to vectorize the loop with strict in-order reductions. This is
not necessary where we allow the reordering of FP operations, such as
when loop hints are passed via metadata.

This patch moves useOrderedReductions so that we can also check whether
loop hints allow reordering, in which case we should use the default
behaviour of vectorizing with unordered reductions.

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D103814
This commit is contained in:
Kerry McLaughlin 2021-06-08 09:16:07 +01:00
parent a2815f65e2
commit eef4c454f0
3 changed files with 52 additions and 23 deletions

View File

@ -332,10 +332,6 @@ static cl::opt<bool>
cl::desc("Prefer in-loop vector reductions, " cl::desc("Prefer in-loop vector reductions, "
"overriding the targets preference.")); "overriding the targets preference."));
// FIXME: When loop hints are passed which allow reordering of FP operations,
// we still choose to use strict reductions with this flag. We should instead
// use the default behaviour of vectorizing with unordered reductions if
// reordering is allowed.
cl::opt<bool> EnableStrictReductions( cl::opt<bool> EnableStrictReductions(
"enable-strict-reductions", cl::init(false), cl::Hidden, "enable-strict-reductions", cl::init(false), cl::Hidden,
cl::desc("Enable the vectorisation of loops with in-order (strict) " cl::desc("Enable the vectorisation of loops with in-order (strict) "
@ -558,6 +554,10 @@ public:
/// Fix the non-induction PHIs in the OrigPHIsToFix vector. /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
void fixNonInductionPHIs(VPTransformState &State); void fixNonInductionPHIs(VPTransformState &State);
/// Returns true if the reordering of FP operations is not allowed, but we are
/// able to vectorize with strict in-order reductions for the given RdxDesc.
bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
/// Create a broadcast instruction. This method generates a broadcast /// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction /// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ... /// value. If this is the induction variable then we extend it to N, N+1, ...
@ -1306,6 +1306,15 @@ public:
/// outside. In loop reductions are collected into InLoopReductionChains. /// outside. In loop reductions are collected into InLoopReductionChains.
void collectInLoopReductions(); void collectInLoopReductions();
/// Decides whether the reduction described by \p RdxDesc should be vectorized
/// as a strict in-order reduction. The answer is true only when all of the
/// following hold, checked in this order:
///   1. the -enable-strict-reductions flag (EnableStrictReductions) is set,
///   2. the loop hints do not allow reordering of FP operations, and
///   3. RdxDesc.isOrdered() reports an ordered reduction.
bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
  // Strict reductions are opt-in; without the flag there is nothing to decide.
  if (!EnableStrictReductions)
    return false;
  // When the hints permit reordering, fall back to unordered reductions.
  if (Hints->allowReordering())
    return false;
  return RdxDesc.isOrdered();
}
/// \returns The smallest bitwidth each instruction can be represented with. /// \returns The smallest bitwidth each instruction can be represented with.
/// The vector equivalents of these instructions should be truncated to this /// The vector equivalents of these instructions should be truncated to this
/// type. /// type.
@ -4316,10 +4325,6 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
} }
/// Returns true when the -enable-strict-reductions flag
/// (EnableStrictReductions) is set and \p RdxDesc reports an ordered
/// reduction. Loop hints are not consulted by this helper.
static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
  if (!EnableStrictReductions)
    return false;
  return RdxDesc.isOrdered();
}
void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR, void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
VPTransformState &State) { VPTransformState &State) {
PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
@ -4348,7 +4353,7 @@ void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi && bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
useOrderedReductions(RdxDesc); Cost->useOrderedReductions(RdxDesc);
for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Part = 0; Part < UF; ++Part) {
if (IsOrdered && Part > 0) if (IsOrdered && Part > 0)
@ -4654,6 +4659,10 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
} }
} }
/// Reports whether \p RdxDesc should be vectorized with strict in-order
/// reductions. The decision is delegated unchanged to
/// Cost->useOrderedReductions(); this member only exposes that query on the
/// InnerLoopVectorizer itself, so code that holds an InnerLoopVectorizer
/// pointer can ask the question without reaching into Cost.
bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
return Cost->useOrderedReductions(RdxDesc);
}
void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
VPUser &Operands, unsigned UF, VPUser &Operands, unsigned UF,
ElementCount VF, bool IsPtrLoopInvariant, ElementCount VF, bool IsPtrLoopInvariant,
@ -4793,7 +4802,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
bool IsOrdered = State.VF.isVector() && bool IsOrdered = State.VF.isVector() &&
Cost->isInLoopReduction(cast<PHINode>(PN)) && Cost->isInLoopReduction(cast<PHINode>(PN)) &&
useOrderedReductions(*RdxDesc); Cost->useOrderedReductions(*RdxDesc);
for (unsigned Part = 0; Part < State.UF; ++Part) { for (unsigned Part = 0; Part < State.UF; ++Part) {
// This is phase one of vectorizing PHIs. // This is phase one of vectorizing PHIs.
@ -9486,7 +9495,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
Value *PrevInChain = State.get(getChainOp(), 0); Value *PrevInChain = State.get(getChainOp(), 0);
for (unsigned Part = 0; Part < State.UF; ++Part) { for (unsigned Part = 0; Part < State.UF; ++Part) {
RecurKind Kind = RdxDesc->getRecurrenceKind(); RecurKind Kind = RdxDesc->getRecurrenceKind();
bool IsOrdered = useOrderedReductions(*RdxDesc); bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
Value *NewVecOp = State.get(getVecOp(), Part); Value *NewVecOp = State.get(getVecOp(), Part);
if (VPValue *Cond = getCondOp()) { if (VPValue *Cond = getCondOp()) {
Value *NewCond = State.get(Cond, Part); Value *NewCond = State.get(Cond, Part);

View File

@ -1,6 +1,7 @@
; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED ; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED ; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED ; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) { define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict ; CHECK-ORDERED-LABEL: @fadd_strict

View File

@ -1,6 +1,7 @@
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) { define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict ; CHECK-ORDERED-LABEL: @fadd_strict
@ -551,10 +552,10 @@ for.end: ; preds = %for.body
; return sum; ; return sum;
;} ;}
; ;
; Note: These tests do not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
; with the -hints-allow-reordering flag set to true.
; Strict reduction could be performed in-loop, but ordered FP induction variables are not supported ; Strict reduction could be performed in-loop, but ordered FP induction variables are not supported
; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
; with the -hints-allow-reordering flag set to true.
define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) { define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
; CHECK-ORDERED-LABEL: @induction_and_reduction ; CHECK-ORDERED-LABEL: @induction_and_reduction
; CHECK-ORDERED-NOT: vector.body ; CHECK-ORDERED-NOT: vector.body
@ -594,25 +595,41 @@ define float @fast_induction_and_reduction(float* nocapture readonly %values, fl
; CHECK-ORDERED: vector.body ; CHECK-ORDERED: vector.body
; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ] ; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ] ; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
; CHECK-ORDERED: %[[STEP_ADD:.*]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>* ; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]]) ; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]])
; CHECK-ORDERED: %[[FADD2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[FADD1]], <4 x float> %[[LOAD2]]) ; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[STEP_ADD]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
; CHECK-ORDERED: for.body ; CHECK-ORDERED: for.body
; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD3:.*]], %for.body ] ; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ] ; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
; CHECK-ORDERED: store float %[[IND_SUM_PHI]], float* ; CHECK-ORDERED: store float %[[IND_SUM_PHI]], float*
; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00 ; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float* ; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
; CHECK-ORDERED: %[[FADD3]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD3]] ; CHECK-ORDERED: %[[FADD2]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
; CHECK-ORDERED: for.end ; CHECK-ORDERED: for.end
; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD3]], %for.body ], [ %[[FADD2]], %middle.block ] ; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[FADD1]], %middle.block ]
; CHECK-ORDERED: ret float %[[RES_PHI]] ; CHECK-ORDERED: ret float %[[RES_PHI]]
; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction ; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction
; CHECK-UNORDERED-NOT: vector.body ; CHECK-UNORDERED: vector.ph
; CHECK-UNORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ]
; CHECK-UNORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[RDX_PHI]], %[[LOAD1]]
; CHECK-UNORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
; CHECK-UNORDERED: middle.block:
; CHECK-UNORDERED: %[[VEC_RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]])
; CHECK-UNORDERED: for.body:
; CHECK-UNORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD:.*]], %for.body ]
; CHECK-UNORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
; CHECK-UNORDERED: store float %[[IND_SUM_PHI]], float*
; CHECK-UNORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[VEC_RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RES_PHI]]
; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction ; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction
; CHECK-NOT-VECTORIZED-NOT: vector.body ; CHECK-NOT-VECTORIZED-NOT: vector.body
@ -632,13 +649,15 @@ for.body:
%add3 = fadd float %sum.015, %0 %add3 = fadd float %sum.015, %0
%iv.next = add nuw nsw i64 %iv, 1 %iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %N %exitcond.not = icmp eq i64 %iv.next, %N
br i1 %exitcond.not, label %for.end, label %for.body br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
for.end: for.end:
ret float %add3 ret float %add3
} }
; The FP induction is fast, but here we can't vectorize as only one of the reductions is an FAdd that can be performed in-loop ; The FP induction is fast, but here we can't vectorize as only one of the reductions is an FAdd that can be performed in-loop
; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
; with the -hints-allow-reordering flag set to true.
define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) { define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) {
; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction ; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction