
Support generic expansion of ordered vector reduction (PR36732)

Without fast math flags, the llvm.experimental.vector.reduce.fadd/fmul intrinsics must be expanded in order.

This patch scalarizes the reduction, applying the accumulator at the start of the sequence: ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[NumElts-1]
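
For illustration only, a minimal C++ model of the expansion this patch generates (the function name and the plain-array signature are invented for this sketch; only the evaluation order matters):

    // Strict (ordered) fadd reduction: seed with the accumulator, then fold
    // the lanes left to right, preserving IEEE rounding at every step.
    float orderedFAddReduce(float Acc, const float *Scl, unsigned NumElts) {
      float Result = Acc;
      for (unsigned I = 0; I != NumElts; ++I)
        Result = Result + Scl[I];
      return Result;
    }

The order is observable without fast math: for Scl = {1e8f, 1.0f, -1e8f, 1.0f} and Acc = 0.0f, this left-to-right fold yields 1.0f, while a pairwise (shuffle-style) fold yields 2.0f.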

Differential Revision: https://reviews.llvm.org/D45366

llvm-svn: 329585
Simon Pilgrim 2018-04-09 15:44:20 +00:00
parent 9ce45136a8
commit 668d706414
4 changed files with 84 additions and 14 deletions

include/llvm/Transforms/Utils/LoopUtils.h

@@ -509,6 +509,13 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                         LoopSafetyInfo *SafetyInfo,
                         OptimizationRemarkEmitter *ORE = nullptr);
+
+/// Generates an ordered vector reduction using extracts to reduce the value.
+Value *
+getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, unsigned Op,
+                    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+                        RecurrenceDescriptor::MRK_Invalid,
+                    ArrayRef<Value *> RedOps = None);
 /// Generates a vector reduction using shufflevectors to reduce the value.
 Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                            RecurrenceDescriptor::MinMaxRecurrenceKind
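
A usage sketch for the new declaration (not part of the patch; the builder B and the values Acc and Vec are assumed to be in scope), relying on the defaulted trailing arguments:

    // Emit a scalarized, strictly ordered fadd reduction of Vec seeded with
    // Acc; MinMaxKind defaults to MRK_Invalid and RedOps to an empty list.
    Value *Rdx = getOrderedReduction(B, Acc, Vec, Instruction::FAdd);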

lib/CodeGen/ExpandReductions.cpp

@@ -85,6 +85,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
   for (auto *II : Worklist) {
     IRBuilder<> Builder(II);
+    bool IsOrdered = false;
+    Value *Acc = nullptr;
     Value *Vec = nullptr;
     auto ID = II->getIntrinsicID();
     auto MRK = RecurrenceDescriptor::MRK_Invalid;
@@ -92,11 +94,10 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     case Intrinsic::experimental_vector_reduce_fadd:
     case Intrinsic::experimental_vector_reduce_fmul:
       // FMFs must be attached to the call, otherwise it's an ordered reduction
-      // and it can't be handled by generating this shuffle sequence.
-      // TODO: Implement scalarization of ordered reductions here for targets
-      // without native support.
+      // and it can't be handled by generating a shuffle sequence.
       if (!II->getFastMathFlags().isFast())
-        continue;
+        IsOrdered = true;
+      Acc = II->getArgOperand(0);
       Vec = II->getArgOperand(1);
       break;
     case Intrinsic::experimental_vector_reduce_add:
@@ -118,7 +119,9 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     }
     if (!TTI->shouldExpandReduction(II))
       continue;
-    auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+    Value *Rdx =
+        IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
+                  : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
     II->replaceAllUsesWith(Rdx);
     II->eraseFromParent();
     Changed = true;
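
The getOpcode helper used in the call above is defined earlier in this file and is not part of this diff; a condensed, illustrative sketch of the mapping it performs for the two ordered cases (the real helper covers every reduction intrinsic):

    // Illustrative only: map a reduction intrinsic to the scalar IR opcode
    // that the expansion chains together.
    static unsigned getOpcodeSketch(Intrinsic::ID ID) {
      switch (ID) {
      case Intrinsic::experimental_vector_reduce_fadd:
        return Instruction::FAdd;
      case Intrinsic::experimental_vector_reduce_fmul:
        return Instruction::FMul;
      default:
        llvm_unreachable("unexpected reduction intrinsic");
      }
    }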

lib/Transforms/Utils/LoopUtils.cpp

@@ -1526,6 +1526,38 @@ static Value *addFastMathFlag(Value *V) {
   return V;
 }
+
+// Helper to generate an ordered reduction.
+Value *
+llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
+                          unsigned Op,
+                          RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+                          ArrayRef<Value *> RedOps) {
+  unsigned VF = Src->getType()->getVectorNumElements();
+  // Extract and apply reduction ops in ascending order:
+  // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[VF-1]
+  Value *Result = Acc;
+  for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
+    Value *Ext =
+        Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
+
+    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+      Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
+                                   "bin.rdx");
+    } else {
+      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+             "Invalid min/max");
+      Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result,
+                                                    Ext);
+    }
+
+    if (!RedOps.empty())
+      propagateIRFlags(Result, RedOps);
+  }
+
+  return Result;
+}
+
 // Helper to generate a log2 shuffle reduction.
 Value *
 llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
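
For contrast with the ordered helper above, a C++ model of the log2 shuffle reduction that getShuffleReduction emits (illustrative only; this pairwise re-association is exactly what strict FP semantics forbid, which is why the ordered path is needed):

    #include <vector>

    // Fold the upper half of the lanes onto the lower half, halving the
    // active width each step; VF is assumed to be a power of two.
    float shuffleFAddReduce(const float *Scl, unsigned VF) {
      std::vector<float> Tmp(Scl, Scl + VF);
      for (unsigned Width = VF / 2; Width != 0; Width /= 2)
        for (unsigned I = 0; I != Width; ++I)
          Tmp[I] += Tmp[I + Width];
      return Tmp[0];
    }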

test/CodeGen/Generic/expand-experimental-reductions.ll

@@ -117,8 +117,15 @@ entry:
 define float @fadd_f32_strict(<4 x float> %vec) {
 ; CHECK-LABEL: @fadd_f32_strict(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
@@ -128,8 +135,15 @@ entry:
 define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {
 ; CHECK-LABEL: @fadd_f32_strict_accum(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
@@ -169,8 +183,15 @@ entry:
 define float @fmul_f32_strict(<4 x float> %vec) {
 ; CHECK-LABEL: @fmul_f32_strict(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
@@ -180,8 +201,15 @@ entry:
 define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {
 ; CHECK-LABEL: @fmul_f32_strict_accum(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)