
Support generic expansion of ordered vector reduction (PR36732)

Without fast math flags, the llvm.experimental.vector.reduce.fadd/fmul intrinsics must be expanded in order.

This patch scalarizes the reduction, applying the accumulator at the start of the sequence: ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[NumElts-1]
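
For illustration only, a minimal C++ model of the expansion this patch generates (the function name and the plain-array signature are invented for this sketch; only the evaluation order matters):

    // Strict (ordered) fadd reduction: seed with the accumulator, then fold
    // the lanes left to right, preserving IEEE rounding at every step.
    float orderedFAddReduce(float Acc, const float *Scl, unsigned NumElts) {
      float Result = Acc;
      for (unsigned I = 0; I != NumElts; ++I)
        Result = Result + Scl[I];
      return Result;
    }

The order is observable without fast math: for Scl = {1e8f, 1.0f, -1e8f, 1.0f} and Acc = 0.0f, this left-to-right fold yields 1.0f, while a pairwise (shuffle-style) fold yields 2.0f.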

Differential Revision: https://reviews.llvm.org/D45366

llvm-svn: 329585
Simon Pilgrim 2018-04-09 15:44:20 +00:00
parent 9ce45136a8
commit 668d706414
4 changed files with 84 additions and 14 deletions

include/llvm/Transforms/Utils/LoopUtils.h

@@ -509,6 +509,13 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                         LoopSafetyInfo *SafetyInfo,
                         OptimizationRemarkEmitter *ORE = nullptr);
+
+/// Generates an ordered vector reduction using extracts to reduce the value.
+Value *
+getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, unsigned Op,
+                    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+                        RecurrenceDescriptor::MRK_Invalid,
+                    ArrayRef<Value *> RedOps = None);
 /// Generates a vector reduction using shufflevectors to reduce the value.
 Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                            RecurrenceDescriptor::MinMaxRecurrenceKind
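
A usage sketch for the new declaration (not part of the patch; the builder B and the values Acc and Vec are assumed to be in scope), relying on the defaulted trailing arguments:

    // Emit a scalarized, strictly ordered fadd reduction of Vec seeded with
    // Acc; MinMaxKind defaults to MRK_Invalid and RedOps to an empty list.
    Value *Rdx = getOrderedReduction(B, Acc, Vec, Instruction::FAdd);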

lib/CodeGen/ExpandReductions.cpp

@@ -85,6 +85,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
   for (auto *II : Worklist) {
     IRBuilder<> Builder(II);
+    bool IsOrdered = false;
+    Value *Acc = nullptr;
     Value *Vec = nullptr;
     auto ID = II->getIntrinsicID();
     auto MRK = RecurrenceDescriptor::MRK_Invalid;
@@ -92,11 +94,10 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     case Intrinsic::experimental_vector_reduce_fadd:
     case Intrinsic::experimental_vector_reduce_fmul:
       // FMFs must be attached to the call, otherwise it's an ordered reduction
-      // and it can't be handled by generating this shuffle sequence.
-      // TODO: Implement scalarization of ordered reductions here for targets
-      // without native support.
+      // and it can't be handled by generating a shuffle sequence.
       if (!II->getFastMathFlags().isFast())
-        continue;
+        IsOrdered = true;
+      Acc = II->getArgOperand(0);
       Vec = II->getArgOperand(1);
       break;
     case Intrinsic::experimental_vector_reduce_add:
@@ -118,7 +119,9 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
     }
     if (!TTI->shouldExpandReduction(II))
       continue;
-    auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+    Value *Rdx =
+        IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
+                  : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
     II->replaceAllUsesWith(Rdx);
     II->eraseFromParent();
     Changed = true;
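
The getOpcode helper used in the call above is defined earlier in this file and is not part of this diff; a condensed, illustrative sketch of the mapping it performs for the two ordered cases (the real helper covers every reduction intrinsic):

    // Illustrative only: map a reduction intrinsic to the scalar IR opcode
    // that the expansion chains together.
    static unsigned getOpcodeSketch(Intrinsic::ID ID) {
      switch (ID) {
      case Intrinsic::experimental_vector_reduce_fadd:
        return Instruction::FAdd;
      case Intrinsic::experimental_vector_reduce_fmul:
        return Instruction::FMul;
      default:
        llvm_unreachable("unexpected reduction intrinsic");
      }
    }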

lib/Transforms/Utils/LoopUtils.cpp

@@ -1526,6 +1526,38 @@ static Value *addFastMathFlag(Value *V) {
   return V;
 }
+
+// Helper to generate an ordered reduction.
+Value *
+llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
+                          unsigned Op,
+                          RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+                          ArrayRef<Value *> RedOps) {
+  unsigned VF = Src->getType()->getVectorNumElements();
+  // Extract and apply reduction ops in ascending order:
+  // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[VF-1]
+  Value *Result = Acc;
+  for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
+    Value *Ext =
+        Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
+
+    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+      Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
+                                   "bin.rdx");
+    } else {
+      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+             "Invalid min/max");
+      Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result,
+                                                    Ext);
+    }
+
+    if (!RedOps.empty())
+      propagateIRFlags(Result, RedOps);
+  }
+
+  return Result;
+}
+
 // Helper to generate a log2 shuffle reduction.
 Value *
 llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
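
For contrast with the ordered helper above, a C++ model of the log2 shuffle reduction that getShuffleReduction emits (illustrative only; this pairwise re-association is exactly what strict FP semantics forbid, which is why the ordered path is needed):

    #include <vector>

    // Fold the upper half of the lanes onto the lower half, halving the
    // active width each step; VF is assumed to be a power of two.
    float shuffleFAddReduce(const float *Scl, unsigned VF) {
      std::vector<float> Tmp(Scl, Scl + VF);
      for (unsigned Width = VF / 2; Width != 0; Width /= 2)
        for (unsigned I = 0; I != Width; ++I)
          Tmp[I] += Tmp[I + Width];
      return Tmp[0];
    }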

test/CodeGen/Generic/expand-experimental-reductions.ll

@@ -117,8 +117,15 @@ entry:
 define float @fadd_f32_strict(<4 x float> %vec) {
 ; CHECK-LABEL: @fadd_f32_strict(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
@@ -128,8 +135,15 @@ entry:
 define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {
 ; CHECK-LABEL: @fadd_f32_strict_accum(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
@@ -169,8 +183,15 @@ entry:
 define float @fmul_f32_strict(<4 x float> %vec) {
 ; CHECK-LABEL: @fmul_f32_strict(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
@@ -180,8 +201,15 @@ entry:
 define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {
 ; CHECK-LABEL: @fmul_f32_strict_accum(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)