[LoopVectorize] Don't use strict reductions when reordering is allowed
If the `-enable-strict-reductions` flag is set to true, then currently we will always choose to vectorize the loop with strict in-order reductions. This is not necessary where we allow the reordering of FP operations, such as when loop hints are passed via metadata.

This patch moves useOrderedReductions so that we can also check whether loop hints allow reordering, in which case we should use the default behaviour of vectorizing with unordered reductions.

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D103814
commit eef4c454f0
parent a2815f65e2
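Background note (not part of the commit): the ordered/unordered distinction matters because floating-point addition is not associative, so an in-order ("strict") reduction and a reordered (e.g. tree-shaped, vectorized) reduction can produce different results. A minimal standalone C++ sketch of that effect:

#include <cstdio>

int main() {
  // Summing these in order gives 1.0f; summing them pairwise (as a vector
  // reduction might) gives 0.0f, because 1.0f is absorbed when added to 1e8f.
  const float Vals[4] = {1.0f, 1.0e8f, -1.0e8f, 1.0f};

  // Strict in-order reduction: (((0 + v0) + v1) + v2) + v3.
  float Strict = 0.0f;
  for (float V : Vals)
    Strict += V;

  // Reordered (tree) reduction: (v0 + v1) + (v2 + v3).
  float Tree = (Vals[0] + Vals[1]) + (Vals[2] + Vals[3]);

  std::printf("strict = %f, reordered = %f\n", Strict, Tree); // 1.0 vs 0.0
  return 0;
}

This is why the vectorizer may only reassociate an FP reduction when fast-math flags or loop hints explicitly permit it.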
@@ -332,10 +332,6 @@ static cl::opt<bool>
     cl::desc("Prefer in-loop vector reductions, "
              "overriding the targets preference."));

-// FIXME: When loop hints are passed which allow reordering of FP operations,
-// we still choose to use strict reductions with this flag. We should instead
-// use the default behaviour of vectorizing with unordered reductions if
-// reordering is allowed.
 cl::opt<bool> EnableStrictReductions(
     "enable-strict-reductions", cl::init(false), cl::Hidden,
     cl::desc("Enable the vectorisation of loops with in-order (strict) "
@@ -558,6 +554,10 @@ public:
   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
   void fixNonInductionPHIs(VPTransformState &State);

+  /// Returns true if the reordering of FP operations is not allowed, but we are
+  /// able to vectorize with strict in-order reductions for the given RdxDesc.
+  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
+
   /// Create a broadcast instruction. This method generates a broadcast
   /// instruction (shuffle) for loop invariant values and for the induction
   /// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -1306,6 +1306,15 @@ public:
   /// outside. In loop reductions are collected into InLoopReductionChains.
   void collectInLoopReductions();

+  /// Returns true if we should use strict in-order reductions for the given
+  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
+  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
+  /// of FP operations.
+  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+    return EnableStrictReductions && !Hints->allowReordering() &&
+           RdxDesc.isOrdered();
+  }
+
   /// \returns The smallest bitwidth each instruction can be represented with.
   /// The vector equivalents of these instructions should be truncated to this
   /// type.
@@ -4316,10 +4325,6 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
     LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
   }

-static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
-  return EnableStrictReductions && RdxDesc.isOrdered();
-}
-
 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
                                        VPTransformState &State) {
   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
@@ -4348,7 +4353,7 @@ void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
   BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();

   bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
-                   useOrderedReductions(RdxDesc);
+                   Cost->useOrderedReductions(RdxDesc);

   for (unsigned Part = 0; Part < UF; ++Part) {
     if (IsOrdered && Part > 0)
@@ -4654,6 +4659,10 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
   }
 }

+bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+  return Cost->useOrderedReductions(RdxDesc);
+}
+
 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
                                    VPUser &Operands, unsigned UF,
                                    ElementCount VF, bool IsPtrLoopInvariant,
@@ -4793,7 +4802,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,

   bool IsOrdered = State.VF.isVector() &&
                    Cost->isInLoopReduction(cast<PHINode>(PN)) &&
-                   useOrderedReductions(*RdxDesc);
+                   Cost->useOrderedReductions(*RdxDesc);

   for (unsigned Part = 0; Part < State.UF; ++Part) {
     // This is phase one of vectorizing PHIs.
@@ -9486,7 +9495,7 @@ void VPReductionRecipe::execute(VPTransformState &State) {
   Value *PrevInChain = State.get(getChainOp(), 0);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     RecurKind Kind = RdxDesc->getRecurrenceKind();
-    bool IsOrdered = useOrderedReductions(*RdxDesc);
+    bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
     Value *NewVecOp = State.get(getVecOp(), Part);
     if (VPValue *Cond = getCondOp()) {
       Value *NewCond = State.get(Cond, Part);
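Taken together, the LoopVectorize.cpp changes above route every ordered-reduction decision through one cost-model predicate. The following is a simplified, self-contained restatement of that predicate using hypothetical stand-in types (the real ones are LLVM's LoopVectorizeHints and RecurrenceDescriptor, and the real logic lives in LoopVectorizationCostModel::useOrderedReductions):

#include <cstdio>

struct Hints {
  bool AllowReordering; // e.g. derived from !llvm.loop metadata
  bool allowReordering() const { return AllowReordering; }
};

struct RecurrenceDescriptor {
  bool Ordered; // the FP reduction is order-sensitive
  bool isOrdered() const { return Ordered; }
};

static bool EnableStrictReductions = true; // models -enable-strict-reductions

// After the patch: a strict in-order reduction is used only when the flag is
// set, the reduction itself is order-sensitive, AND the hints do not already
// allow reordering. Before the patch, allowReordering() was not consulted.
static bool useOrderedReductions(const Hints &H,
                                 const RecurrenceDescriptor &Rdx) {
  return EnableStrictReductions && !H.allowReordering() && Rdx.isOrdered();
}

int main() {
  RecurrenceDescriptor Rdx{/*Ordered=*/true};
  std::printf("hints forbid reordering -> ordered reduction: %d\n",
              useOrderedReductions(Hints{false}, Rdx)); // 1: strict in-order
  std::printf("hints allow reordering  -> ordered reduction: %d\n",
              useOrderedReductions(Hints{true}, Rdx));  // 0: default unordered
  return 0;
}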
@@ -1,6 +1,7 @@
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED

 define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-ORDERED-LABEL: @fadd_strict
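The four RUN lines in each test file enumerate the full 2x2 flag matrix. This standalone sketch simply tabulates the expected FileCheck prefix for each combination, read directly off the RUN lines above ("strict" is -enable-strict-reductions, "reorder" is -hints-allow-reordering):

#include <cstdio>

int main() {
  struct Case { bool Strict, Reorder; const char *Prefix; } Cases[] = {
      {false, false, "CHECK-NOT-VECTORIZED"}, // can't reassociate, strict off
      {false, true,  "CHECK-UNORDERED"},      // reordering permitted
      {true,  false, "CHECK-ORDERED"},        // strict in-order reduction
      {true,  true,  "CHECK-UNORDERED"},      // reordering overrides strict
  };
  for (const Case &C : Cases)
    std::printf("strict=%d reorder=%d -> %s\n", C.Strict, C.Reorder, C.Prefix);
  return 0;
}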
@@ -1,6 +1,7 @@
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED

 define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-ORDERED-LABEL: @fadd_strict
@@ -551,10 +552,10 @@ for.end: ; preds = %for.body
 ; return sum;
 ;}
 ;
-; Note: These tests do not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
-; with the -hints-allow-reordering flag set to true.

 ; Strict reduction could be performed in-loop, but ordered FP induction variables are not supported
+; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
+; with the -hints-allow-reordering flag set to true.
 define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
 ; CHECK-ORDERED-LABEL: @induction_and_reduction
 ; CHECK-ORDERED-NOT: vector.body
@@ -594,25 +595,41 @@ define float @fast_induction_and_reduction(float* nocapture readonly %values, fl
 ; CHECK-ORDERED: vector.body
 ; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
 ; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
 ; CHECK-ORDERED: %[[STEP_ADD:.*]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
 ; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
 ; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
 ; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]])
 ; CHECK-ORDERED: %[[FADD2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[FADD1]], <4 x float> %[[LOAD2]])
-; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[STEP_ADD]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
 ; CHECK-ORDERED: for.body
-; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD3:.*]], %for.body ]
+; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
 ; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
 ; CHECK-ORDERED: store float %[[IND_SUM_PHI]], float*
 ; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
-; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float*
-; CHECK-ORDERED: %[[FADD3]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD3]]
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD2]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
 ; CHECK-ORDERED: for.end
-; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD3]], %for.body ], [ %[[FADD2]], %middle.block ]
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[FADD1]], %middle.block ]
 ; CHECK-ORDERED: ret float %[[RES_PHI]]

 ; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction
 ; CHECK-UNORDERED-NOT: vector.body
 ; CHECK-UNORDERED: vector.ph
 ; CHECK-UNORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
 ; CHECK-UNORDERED: vector.body
 ; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ]
 ; CHECK-UNORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
 ; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
 ; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[RDX_PHI]], %[[LOAD1]]
 ; CHECK-UNORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
 ; CHECK-UNORDERED: middle.block:
 ; CHECK-UNORDERED: %[[VEC_RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]])
 ; CHECK-UNORDERED: for.body:
 ; CHECK-UNORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD:.*]], %for.body ]
 ; CHECK-UNORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
 ; CHECK-UNORDERED: store float %[[IND_SUM_PHI]], float*
 ; CHECK-UNORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
 ; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
 ; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
 ; CHECK-UNORDERED: for.end
 ; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[VEC_RDX]], %middle.block ]
 ; CHECK-UNORDERED: ret float %[[RES_PHI]]

 ; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction
 ; CHECK-NOT-VECTORIZED-NOT: vector.body
@@ -632,13 +649,15 @@ for.body:
   %add3 = fadd float %sum.015, %0
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, %N
-  br i1 %exitcond.not, label %for.end, label %for.body
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2

 for.end:
   ret float %add3
 }

 ; The FP induction is fast, but here we can't vectorize as only one of the reductions is an FAdd that can be performed in-loop
+; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
+; with the -hints-allow-reordering flag set to true.
 define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) {

 ; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction