
[LV] Avoid unnecessary IV scalar-to-vector-to-scalar conversions

This patch prevents increases in the number of instructions, pre-instcombine,
due to induction variable scalarization. An increase in instructions can lead
to an increase in the compile-time required to simplify the induction
variables. We now maintain a new map for scalarized induction variables to
prevent us from converting between the scalar and vector forms.

This patch should resolve compile-time regressions seen after r274627.

llvm-svn: 275419
Author: Matthew Simpson, 2016-07-14 14:36:06 +00:00
parent 2cf597abfa
commit 9e14b27894
3 changed files with 118 additions and 69 deletions
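
Roughly, for an induction variable that is only used to compute addresses that will be scalarized, the vectorizer previously created the scalar steps, packed them into a vector, and then immediately unpacked that vector again at each use. A simplified sketch of the old IR shape (VF = 2, no interleaving; the value names are illustrative, not taken from the tests below):

%i0 = add i64 %offset.idx, 0
%v0 = insertelement <2 x i64> undef, i64 %i0, i64 0
%i1 = add i64 %offset.idx, 1
%v1 = insertelement <2 x i64> %v0, i64 %i1, i64 1
%e0 = extractelement <2 x i64> %v1, i32 0
%gep0 = getelementptr inbounds i64, i64* %a, i64 %e0
%e1 = extractelement <2 x i64> %v1, i32 1
%gep1 = getelementptr inbounds i64, i64* %a, i64 %e1

With ScalarIVMap, the scalar steps reach the scalarized users directly and the round trip disappears:

%i0 = add i64 %offset.idx, 0
%i1 = add i64 %offset.idx, 1
%gep0 = getelementptr inbounds i64, i64* %a, i64 %i0
%gep1 = getelementptr inbounds i64, i64* %a, i64 %i1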


@ -403,12 +403,12 @@ protected:
/// to each vector element of Val. The sequence starts at StartIndex.
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
/// Compute a step vector like the above function, but scalarize the
/// arithmetic instead. The results of the computation are inserted into a
/// new vector with VF elements. \p Val is the initial value, \p Step is the
/// size of the step, and \p StartIdx indicates the index of the increment
/// from which to start computing the steps.
Value *getScalarizedStepVector(Value *Val, int StartIdx, Value *Step);
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
/// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
/// can be a truncate instruction).
void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);
/// Create a vector induction phi node based on an existing scalar one. This
/// currently only works for integer induction variables with a constant
@ -417,11 +417,11 @@ protected:
void createVectorIntInductionPHI(const InductionDescriptor &II,
VectorParts &Entry, IntegerType *TruncType);
/// Widen an integer induction variable \p IV. If \p TruncType is provided,
/// the induction variable will first be truncated to the specified type. The
/// Widen an integer induction variable \p IV. If \p Trunc is provided, the
/// induction variable will first be truncated to the corresponding type. The
/// widened values are placed in \p Entry.
void widenIntInduction(PHINode *IV, VectorParts &Entry,
IntegerType *TruncType = nullptr);
TruncInst *Trunc = nullptr);
/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
@ -572,6 +572,17 @@ protected:
PHINode *OldInduction;
/// Maps scalars to widened vectors.
ValueMap WidenMap;
/// A map of induction variables from the original loop to their
/// corresponding VF * UF scalarized values in the vectorized loop. The
/// purpose of ScalarIVMap is similar to that of WidenMap. Whereas WidenMap
/// maps original loop values to their vector versions in the new loop,
/// ScalarIVMap maps induction variables from the original loop that are not
/// vectorized to their scalar equivalents in the vector loop. Maintaining a
/// separate map for scalarized induction variables allows us to avoid
/// unnecessary scalar-to-vector-to-scalar conversions.
DenseMap<Value *, SmallVector<Value *, 8>> ScalarIVMap;
/// Store instructions that should be predicated, as a pair
/// <StoreInst, Predicate>
SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores;
@ -1889,7 +1900,7 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
}
void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
IntegerType *TruncType) {
TruncInst *Trunc) {
auto II = Legal->getInductionVars()->find(IV);
assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
@ -1897,6 +1908,9 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
auto ID = II->second;
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
// If a truncate instruction was provided, get the smaller type.
auto *TruncType = Trunc ? cast<IntegerType>(Trunc->getType()) : nullptr;
// The step of the induction.
Value *Step = nullptr;
@ -1939,20 +1953,21 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
}
}
// If an induction variable is only used for counting loop iterations or
// calculating addresses, it shouldn't be widened. Scalarize the step vector
// to give InstCombine a better chance of simplifying it.
if (VF > 1 && ValuesNotWidened->count(IV)) {
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = getScalarizedStepVector(ScalarIV, VF * Part, Step);
return;
}
// Finally, splat the scalar induction variable, and build the necessary step
// vectors.
// Splat the scalar induction variable, and build the necessary step vectors.
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
// If an induction variable is only used for counting loop iterations or
// calculating addresses, it doesn't need to be widened. Create scalar steps
// that can be used by instructions we will later scalarize. Note that the
// addition of the scalar steps will not increase the number of instructions
// in the loop in the common case prior to InstCombine. We will be trading
// one vector extract for each scalar step.
if (VF > 1 && ValuesNotWidened->count(IV)) {
auto *EntryVal = Trunc ? cast<Value>(Trunc) : IV;
buildScalarSteps(ScalarIV, Step, EntryVal);
}
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
@ -1983,27 +1998,25 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
return Builder.CreateAdd(Val, Step, "induction");
}
Value *InnerLoopVectorizer::getScalarizedStepVector(Value *Val, int StartIdx,
Value *Step) {
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Value *EntryVal) {
// We can't create a vector with less than two elements.
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
// Get the value type and ensure it and the step have the same integer type.
Type *ValTy = Val->getType()->getScalarType();
assert(ValTy->isIntegerTy() && ValTy == Step->getType() &&
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
"Val and Step should have the same integer type");
// Compute the scalarized step vector. We perform scalar arithmetic and then
// insert the results into the step vector.
Value *StepVector = UndefValue::get(ToVectorTy(ValTy, VF));
for (unsigned I = 0; I < VF; ++I) {
auto *Mul = Builder.CreateMul(ConstantInt::get(ValTy, StartIdx + I), Step);
auto *Add = Builder.CreateAdd(Val, Mul);
StepVector = Builder.CreateInsertElement(StepVector, Add, I);
}
return StepVector;
// Compute the scalar steps and save the results in ScalarIVMap.
for (unsigned Part = 0; Part < UF; ++Part)
for (unsigned I = 0; I < VF; ++I) {
auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + I);
auto *Mul = Builder.CreateMul(StartIdx, Step);
auto *Add = Builder.CreateAdd(ScalarIV, Mul);
ScalarIVMap[EntryVal].push_back(Add);
}
}
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
@ -2459,8 +2472,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
"Must be last index or loop invariant");
VectorParts &GEPParts = getVectorValue(GepOperand);
Value *Index = GEPParts[0];
Index = Builder.CreateExtractElement(Index, Zero);
// If GepOperand is an induction variable, and there's a scalarized
// version of it available, use it. Otherwise, we will need to create
// an extractelement instruction.
Value *Index = ScalarIVMap.count(GepOperand)
? ScalarIVMap[GepOperand][0]
: Builder.CreateExtractElement(GEPParts[0], Zero);
Gep2->setOperand(i, Index);
Gep2->setName("gep.indvar.idx");
}
@ -2663,11 +2682,17 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
Cloned->setName(Instr->getName() + ".cloned");
// Replace the operands of the cloned instructions with extracted scalars.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *Op = Params[op][Part];
// Param is a vector. Need to extract the right lane.
if (Op->getType()->isVectorTy())
Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
Cloned->setOperand(op, Op);
// If the operand is an induction variable, and there's a scalarized
// version of it available, use it. Otherwise, we will need to create
// an extractelement instruction if vectorizing.
auto *NewOp = Params[op][Part];
auto *ScalarOp = Instr->getOperand(op);
if (ScalarIVMap.count(ScalarOp))
NewOp = ScalarIVMap[ScalarOp][VF * Part + Width];
else if (NewOp->getType()->isVectorTy())
NewOp = Builder.CreateExtractElement(NewOp, Builder.getInt32(Width));
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@ -4160,8 +4185,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
auto ID = Legal->getInductionVars()->lookup(OldInduction);
if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
ID.getConstIntStepValue()) {
auto *TruncType = cast<IntegerType>(CI->getType());
widenIntInduction(OldInduction, Entry, TruncType);
widenIntInduction(OldInduction, Entry, cast<TruncInst>(CI));
addMetadata(Entry, &I);
break;
}
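
In IR terms, buildScalarSteps emits one scalar add per unrolled part and per vector lane, computing ScalarIV + (VF * Part + Lane) * Step and recording the results in ScalarIVMap. Assuming VF = 2, UF = 2, and a step of 8 (the shape the UNROLL-NO-IC lines check for @scalarize_induction_variable_02 below; names again illustrative), that is roughly:

%i0 = add i64 %offset.idx, 0    ; Part 0, lane 0
%i1 = add i64 %offset.idx, 8    ; Part 0, lane 1
%i2 = add i64 %offset.idx, 16   ; Part 1, lane 0
%i3 = add i64 %offset.idx, 24   ; Part 1, lane 1

When the step is a constant, as here, IRBuilder folds the multiply away, so each scalar add simply replaces a vector extract, which is why no pre-instcombine instruction growth is expected in the common case.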


@ -1,6 +1,7 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses -instcombine -S | FileCheck %s --check-prefix=INTERLEAVE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@ -73,6 +74,26 @@ loopexit:
; for (int i = 0; i < n; ++i)
; sum += a[i];
;
; CHECK-LABEL: @scalarize_induction_variable_01(
; CHECK: vector.body:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %[[i0:.+]] = add i64 %index, 0
; CHECK: %[[i1:.+]] = add i64 %index, 1
; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i1]]
;
; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01(
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; UNROLL-NO-IC: %[[i0:.+]] = add i64 %index, 0
; UNROLL-NO-IC: %[[i1:.+]] = add i64 %index, 1
; UNROLL-NO-IC: %[[i2:.+]] = add i64 %index, 2
; UNROLL-NO-IC: %[[i3:.+]] = add i64 %index, 3
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i1]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i2]]
; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i3]]
;
; IND-LABEL: @scalarize_induction_variable_01(
; IND: vector.body:
; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
@ -112,6 +133,34 @@ for.end:
; for (int i = 0; i < n; i += 8)
; s += (a[i] + b[i] + 1.0f);
;
; CHECK-LABEL: @scalarize_induction_variable_02(
; CHECK: vector.body:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = shl i64 %index, 3
; CHECK: %[[i0:.+]] = add i64 %offset.idx, 0
; CHECK: %[[i1:.+]] = add i64 %offset.idx, 8
; CHECK: getelementptr inbounds float, float* %a, i64 %[[i0]]
; CHECK: getelementptr inbounds float, float* %a, i64 %[[i1]]
; CHECK: getelementptr inbounds float, float* %b, i64 %[[i0]]
; CHECK: getelementptr inbounds float, float* %b, i64 %[[i1]]
;
; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_02(
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; UNROLL-NO-IC: %offset.idx = shl i64 %index, 3
; UNROLL-NO-IC: %[[i0:.+]] = add i64 %offset.idx, 0
; UNROLL-NO-IC: %[[i1:.+]] = add i64 %offset.idx, 8
; UNROLL-NO-IC: %[[i2:.+]] = add i64 %offset.idx, 16
; UNROLL-NO-IC: %[[i3:.+]] = add i64 %offset.idx, 24
; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i0]]
; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i1]]
; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i2]]
; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i3]]
; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i0]]
; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i1]]
; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i2]]
; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i3]]
;
; IND-LABEL: @scalarize_induction_variable_02(
; IND: vector.body:
; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]


@ -8,21 +8,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i64 %startval, %index
; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
; CHECK: %[[v0:.+]] = insertelement <4 x i64> undef, i64 %[[a0]], i64 0
; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
; CHECK: %[[v1:.+]] = insertelement <4 x i64> %[[v0]], i64 %[[a1]], i64 1
; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
; CHECK: %[[v2:.+]] = insertelement <4 x i64> %[[v1]], i64 %[[a2]], i64 2
; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
; CHECK: %[[v3:.+]] = insertelement <4 x i64> %[[v2]], i64 %[[a3]], i64 3
; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
; CHECK: %[[v4:.+]] = insertelement <4 x i64> undef, i64 %[[a4]], i64 0
; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
; CHECK: %[[v5:.+]] = insertelement <4 x i64> %[[v4]], i64 %[[a5]], i64 1
; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
; CHECK: %[[v6:.+]] = insertelement <4 x i64> %[[v5]], i64 %[[a6]], i64 2
; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
; CHECK: %[[v7:.+]] = insertelement <4 x i64> %[[v6]], i64 %[[a7]], i64 3
define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
entry:
@ -48,21 +40,13 @@ loopend:
; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i128 %startval, %index
; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0
; CHECK: %[[v0:.+]] = insertelement <4 x i128> undef, i128 %[[a0]], i64 0
; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1
; CHECK: %[[v1:.+]] = insertelement <4 x i128> %[[v0]], i128 %[[a1]], i64 1
; CHECK: %[[a2:.+]] = add i128 %offset.idx, -2
; CHECK: %[[v2:.+]] = insertelement <4 x i128> %[[v1]], i128 %[[a2]], i64 2
; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3
; CHECK: %[[v3:.+]] = insertelement <4 x i128> %[[v2]], i128 %[[a3]], i64 3
; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4
; CHECK: %[[v4:.+]] = insertelement <4 x i128> undef, i128 %[[a4]], i64 0
; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5
; CHECK: %[[v5:.+]] = insertelement <4 x i128> %[[v4]], i128 %[[a5]], i64 1
; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6
; CHECK: %[[v6:.+]] = insertelement <4 x i128> %[[v5]], i128 %[[a6]], i64 2
; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7
; CHECK: %[[v7:.+]] = insertelement <4 x i128> %[[v6]], i128 %[[a7]], i64 3
define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
entry:
@ -88,21 +72,13 @@ loopend:
; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %offset.idx = sub i16 %startval, {{.*}}
; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0
; CHECK: %[[v0:.+]] = insertelement <4 x i16> undef, i16 %[[a0]], i64 0
; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1
; CHECK: %[[v1:.+]] = insertelement <4 x i16> %[[v0]], i16 %[[a1]], i64 1
; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2
; CHECK: %[[v2:.+]] = insertelement <4 x i16> %[[v1]], i16 %[[a2]], i64 2
; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3
; CHECK: %[[v3:.+]] = insertelement <4 x i16> %[[v2]], i16 %[[a3]], i64 3
; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4
; CHECK: %[[v4:.+]] = insertelement <4 x i16> undef, i16 %[[a4]], i64 0
; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5
; CHECK: %[[v5:.+]] = insertelement <4 x i16> %[[v4]], i16 %[[a5]], i64 1
; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6
; CHECK: %[[v6:.+]] = insertelement <4 x i16> %[[v5]], i16 %[[a6]], i64 2
; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7
; CHECK: %[[v7:.+]] = insertelement <4 x i16> %[[v6]], i16 %[[a7]], i64 3
define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
entry: