1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-24 03:33:20 +01:00

LoopVectorize: Implement partial loop unrolling when vectorization is not profitable.

This patch enables unrolling of loops when vectorization is legal but not profitable.
We add a new class InnerLoopUnroller, that extends InnerLoopVectorizer and replaces some of the vector-specific logic with scalars.

This patch does not introduce any runtime regressions and improves the following workloads:

SingleSource/Benchmarks/Shootout/matrix -22.64%
SingleSource/Benchmarks/Shootout-C++/matrix -13.06%
External/SPEC/CINT2006/464_h264ref/464_h264ref  -3.99%
SingleSource/Benchmarks/Adobe-C++/simple_types_constant_folding -1.95%

llvm-svn: 189281
This commit is contained in:
Nadav Rotem 2013-08-26 22:33:26 +00:00
parent 2ad50e72ed
commit 7d4e24e1f4
2 changed files with 534 additions and 167 deletions

View File

@ -126,6 +126,9 @@ static const unsigned MaxVectorWidth = 64;
/// Maximum vectorization unroll count.
static const unsigned MaxUnrollFactor = 16;
/// The cost of a loop that is considered 'small' by the unroller.
static const unsigned SmallLoopCost = 20;
namespace {
// Forward declarations.
@ -167,7 +170,9 @@ public:
updateAnalysis();
}
private:
virtual ~InnerLoopVectorizer() {}
protected:
/// A small list of PHINodes.
typedef SmallVector<PHINode*, 4> PhiVector;
/// When we unroll loops we have multiple vector values for each scalar.
@ -187,7 +192,13 @@ private:
/// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop.
void vectorizeLoop(LoopVectorizationLegality *Legal);
virtual void vectorizeLoop(LoopVectorizationLegality *Legal);
/// \brief The Loop exit block may have single value PHI nodes where the
/// incoming value is 'Undef'. While vectorizing we only handled real values
/// that were defined inside the loop. Here we fix the 'undef case'.
/// See PR14725.
void fixLCSSAPHIs();
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True. It returns the *entry*
@ -201,16 +212,23 @@ private:
void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
PhiVector *PV);
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
LoopVectorizationLegality *Legal,
unsigned UF, unsigned VF, PhiVector *PV);
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();
/// This instruction is un-vectorizable. Implement it as a sequence
/// of scalars.
void scalarizeInstruction(Instruction *Instr);
virtual void scalarizeInstruction(Instruction *Instr);
/// Vectorize Load and Store instructions,
void vectorizeMemoryInstruction(Instruction *Instr,
virtual void vectorizeMemoryInstruction(Instruction *Instr,
LoopVectorizationLegality *Legal);
/// Create a broadcast instruction. This method generates a broadcast
@ -218,12 +236,12 @@ private:
/// value. If this is the induction variable then we extend it to N, N+1, ...
/// this is needed because each iteration in the loop corresponds to a SIMD
/// element.
Value *getBroadcastInstrs(Value *V);
virtual Value *getBroadcastInstrs(Value *V);
/// This function adds 0, 1, 2 ... to each vector element, starting at zero.
/// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
/// The sequence starts at StartIndex.
Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
@ -233,7 +251,7 @@ private:
VectorParts &getVectorValue(Value *V);
/// Generate a shuffle sequence that will reverse the vector Vec.
Value *reverseVector(Value *Vec);
virtual Value *reverseVector(Value *Vec);
/// This is a helper class that holds the vectorizer state. It maps scalar
/// instructions to vector instructions. When the code is 'unrolled' then
@ -291,6 +309,8 @@ private:
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
unsigned VF;
protected:
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@ -326,6 +346,23 @@ private:
EdgeMaskCache MaskCache;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
DominatorTree *DT, DataLayout *DL,
const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
private:
virtual void vectorizeLoop(LoopVectorizationLegality *Legal);
virtual void scalarizeInstruction(Instruction *Instr);
virtual void vectorizeMemoryInstruction(Instruction *Instr,
LoopVectorizationLegality *Legal);
virtual Value *getBroadcastInstrs(Value *V);
virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
virtual Value *reverseVector(Value *Vec);
};
/// \brief Look for a meaningful debug location on the instruction or it's
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
@ -875,7 +912,7 @@ struct LoopVectorize : public LoopPass {
LoopVectorizeHints Hints(L);
if (Hints.Width == 1) {
if (Hints.Width == 1 && Hints.Unroll == 1) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
@ -914,16 +951,23 @@ struct LoopVectorize : public LoopPass {
if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
return false;
}
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
F->getParent()->getModuleIdentifier()<<"\n");
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
// If we decided that it is *legal* to vectorize the loop then do it.
InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
LB.vectorize(&LVL);
if (VF.Width == 1) {
if (UF == 1)
return false;
// We decided not to vectorize, but we may want to unroll.
InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
Unroller.vectorize(&LVL);
} else {
// If we decided that it is *legal* to vectorize the loop then do it.
InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
LB.vectorize(&LVL);
}
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized(L);
@ -2136,11 +2180,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
fixLCSSAPHIs();
}
// The Loop exit block may have single value PHI nodes where the incoming
// value is 'undef'. While vectorizing we only handled real values that
// were defined inside the loop. Here we handle the 'undef case'.
// See PR14725.
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
@ -2149,7 +2193,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
LoopMiddleBlock);
}
}
}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
@ -2210,6 +2254,170 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
return BlockMask;
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
InnerLoopVectorizer::VectorParts &Entry,
LoopVectorizationLegality *Legal,
unsigned UF, unsigned VF, PhiVector *PV) {
PHINode* P = cast<PHINode>(PN);
// Handle reduction variables:
if (Legal->getReductionVars()->count(P)) {
for (unsigned part = 0; part < UF; ++part) {
// This is phase one of vectorizing PHIs.
Type *VecTy = (VF == 1) ? PN->getType() :
VectorType::get(PN->getType(), VF);
Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
LoopVectorBody-> getFirstInsertionPt());
}
PV->push_back(P);
return;
}
setDebugLocFromInst(Builder, P);
// Check for PHI nodes that are lowered to vector selects.
if (P->getParent() != OrigLoop->getHeader()) {
// We know that all PHIs in non header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
// At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
unsigned NumIncoming = P->getNumIncomingValues();
// Generate a sequence of selects of the form:
// SELECT(Mask3, In3,
// SELECT(Mask2, In2,
// ( ...)))
for (unsigned In = 0; In < NumIncoming; In++) {
VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
P->getParent());
VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
for (unsigned part = 0; part < UF; ++part) {
// We might have single edge PHIs (blocks) - use an identity
// 'select' for the first PHI operand.
if (In == 0)
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
In0[part]);
else
// Select between the current value and the previous incoming edge
// based on the incoming mask.
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
Entry[part], "predphi");
}
}
return;
}
// This PHINode must be an induction variable.
// Make sure that we know about it.
assert(Legal->getInductionVars()->count(P) &&
"Not an induction variable");
LoopVectorizationLegality::InductionInfo II =
Legal->getInductionVars()->lookup(P);
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
llvm_unreachable("Unknown induction");
case LoopVectorizationLegality::IK_IntInduction: {
assert(P->getType() == II.StartValue->getType() && "Types must match");
Type *PhiTy = P->getType();
Value *Broadcasted;
if (P == OldInduction) {
// Handle the canonical induction variable. We might have had to
// extend the type.
Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
} else {
// Handle other induction variables that are now based on the
// canonical one.
Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
"normalized.idx");
NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
"offset.idx");
}
Broadcasted = getBroadcastInstrs(Broadcasted);
// After broadcasting the induction variable we need to make the vector
// consecutive by adding 0, 1, 2, etc.
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
return;
}
case LoopVectorizationLegality::IK_ReverseIntInduction:
case LoopVectorizationLegality::IK_PtrInduction:
case LoopVectorizationLegality::IK_ReversePtrInduction:
// Handle reverse integer and pointer inductions.
Value *StartIdx = ExtendedIdx;
// This is the normalized GEP that starts counting at zero.
Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
"normalized.idx");
// Handle the reverse integer induction variable case.
if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
"resize.norm.idx");
Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
"reverse.idx");
// This is a new value so do not hoist it out.
Value *Broadcasted = getBroadcastInstrs(ReverseInd);
// After broadcasting the induction variable we need to make the
// vector consecutive by adding ... -3, -2, -1, 0.
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
true);
return;
}
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
// Is this a reverse induction ptr or a consecutive induction ptr.
bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
II.IK);
// This is the vector of results. Notice that we don't generate
// vector geps because scalar geps result in better code.
for (unsigned part = 0; part < UF; ++part) {
if (VF == 1) {
int EltIndex = (part) * (Reverse ? -1 : 1);
Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
Value *GlobalIdx;
if (Reverse)
GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
else
GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
"next.gep");
Entry[part] = SclrGep;
continue;
}
Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
for (unsigned int i = 0; i < VF; ++i) {
int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
Value *GlobalIdx;
if (!Reverse)
GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
else
GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
"next.gep");
VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
Builder.getInt32(i),
"insert.gep");
}
Entry[part] = VecVal;
}
return;
}
}
void
InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
BasicBlock *BB, PhiVector *PV) {
@ -2222,149 +2430,9 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
// loop control flow instructions.
continue;
case Instruction::PHI:{
PHINode* P = cast<PHINode>(it);
// Handle reduction variables:
if (Legal->getReductionVars()->count(P)) {
for (unsigned part = 0; part < UF; ++part) {
// This is phase one of vectorizing PHIs.
Type *VecTy = VectorType::get(it->getType(), VF);
Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
LoopVectorBody-> getFirstInsertionPt());
}
PV->push_back(P);
continue;
}
setDebugLocFromInst(Builder, P);
// Check for PHI nodes that are lowered to vector selects.
if (P->getParent() != OrigLoop->getHeader()) {
// We know that all PHIs in non header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
// At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
unsigned NumIncoming = P->getNumIncomingValues();
// Generate a sequence of selects of the form:
// SELECT(Mask3, In3,
// SELECT(Mask2, In2,
// ( ...)))
for (unsigned In = 0; In < NumIncoming; In++) {
VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
P->getParent());
VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
for (unsigned part = 0; part < UF; ++part) {
// We might have single edge PHIs (blocks) - use an identity
// 'select' for the first PHI operand.
if (In == 0)
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
In0[part]);
else
// Select between the current value and the previous incoming edge
// based on the incoming mask.
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
Entry[part], "predphi");
}
}
continue;
}
// This PHINode must be an induction variable.
// Make sure that we know about it.
assert(Legal->getInductionVars()->count(P) &&
"Not an induction variable");
LoopVectorizationLegality::InductionInfo II =
Legal->getInductionVars()->lookup(P);
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
llvm_unreachable("Unknown induction");
case LoopVectorizationLegality::IK_IntInduction: {
assert(P->getType() == II.StartValue->getType() && "Types must match");
Type *PhiTy = P->getType();
Value *Broadcasted;
if (P == OldInduction) {
// Handle the canonical induction variable. We might have had to
// extend the type.
Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
} else {
// Handle other induction variables that are now based on the
// canonical one.
Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
"normalized.idx");
NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
"offset.idx");
}
Broadcasted = getBroadcastInstrs(Broadcasted);
// After broadcasting the induction variable we need to make the vector
// consecutive by adding 0, 1, 2, etc.
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
continue;
}
case LoopVectorizationLegality::IK_ReverseIntInduction:
case LoopVectorizationLegality::IK_PtrInduction:
case LoopVectorizationLegality::IK_ReversePtrInduction:
// Handle reverse integer and pointer inductions.
Value *StartIdx = ExtendedIdx;
// This is the normalized GEP that starts counting at zero.
Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
"normalized.idx");
// Handle the reverse integer induction variable case.
if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
"resize.norm.idx");
Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
"reverse.idx");
// This is a new value so do not hoist it out.
Value *Broadcasted = getBroadcastInstrs(ReverseInd);
// After broadcasting the induction variable we need to make the
// vector consecutive by adding ... -3, -2, -1, 0.
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
true);
continue;
}
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
// Is this a reverse induction ptr or a consecutive induction ptr.
bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
II.IK);
// This is the vector of results. Notice that we don't generate
// vector geps because scalar geps result in better code.
for (unsigned part = 0; part < UF; ++part) {
Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
for (unsigned int i = 0; i < VF; ++i) {
int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
Value *GlobalIdx;
if (!Reverse)
GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
else
GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
"next.gep");
VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
Builder.getInt32(i),
"insert.gep");
}
Entry[part] = VecVal;
}
continue;
}
// Vectorize PHINodes.
widenPHIInstruction(it, Entry, Legal, UF, VF, PV);
continue;
}// End of PHI.
case Instruction::Add:
@ -2423,8 +2491,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
VectorParts &Cond = getVectorValue(it->getOperand(0));
VectorParts &Op0 = getVectorValue(it->getOperand(1));
VectorParts &Op1 = getVectorValue(it->getOperand(2));
Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
Builder.getInt32(0));
Value *ScalarCond = (VF == 1) ? Cond[0] :
Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = Builder.CreateSelect(
InvariantCond ? ScalarCond : Cond[Part],
@ -2485,7 +2555,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
break;
}
/// Vectorize casts.
Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
Type *DestTy = (VF == 1) ? CI->getType() :
VectorType::get(CI->getType(), VF);
VectorParts &A = getVectorValue(it->getOperand(0));
for (unsigned Part = 0; Part < UF; ++Part)
@ -2515,7 +2586,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
Args.push_back(Arg[Part]);
}
Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
Type *Tys[] = {CI->getType()};
if (VF > 1)
Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
Function *F = Intrinsic::getDeclaration(M, ID, Tys);
Entry[Part] = Builder.CreateCall(F, Args);
}
@ -4267,7 +4341,19 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
else if (UF < 1)
UF = 1;
if (Legal->getReductionVars()->size()) {
bool HasReductions = Legal->getReductionVars()->size();
// Decide if we want to unroll if we decided that it is legal to vectorize
// but not profitable.
if (VF == 1) {
if (TheLoop->getNumBlocks() > 1 || !HasReductions ||
LoopCost > SmallLoopCost)
return 1;
return UF;
}
if (HasReductions) {
DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
return UF;
}
@ -4277,9 +4363,9 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
// to estimate the cost of the loop and unroll until the cost of the
// loop overhead is about 5% of the cost of the loop.
DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
if (LoopCost < 20) {
if (LoopCost < SmallLoopCost) {
DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
unsigned NewUF = 20/LoopCost + 1;
unsigned NewUF = SmallLoopCost / (LoopCost + 1);
return std::min(NewUF, UF);
}
@ -4701,3 +4787,245 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
return false;
}
void
InnerLoopUnroller::vectorizeLoop(LoopVectorizationLegality *Legal) {
// In order to support reduction variables we need to be able to unroll
// Phi nodes. Phi nodes have cycles, so we need to unroll them in two
// stages. See InnerLoopVectorizer::vectorizeLoop for more details.
PhiVector RdxPHIsToFix;
// Scan the loop in a topological order to ensure that defs are vectorized
// before users.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
// Unroll all of the blocks in the original loop.
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO();
bb != be; ++bb)
vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
// Create the 'reduced' values for each of the induction vars.
// The reduced values are the vector values that we scalarize and combine
// after the loop is finished.
for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
it != e; ++it) {
PHINode *RdxPhi = *it;
assert(RdxPhi && "Unable to recover vectorized PHI");
// Find the reduction variable descriptor.
assert(Legal->getReductionVars()->count(RdxPhi) &&
"Unable to find the reduction variable");
LoopVectorizationLegality::ReductionDescriptor RdxDesc =
(*Legal->getReductionVars())[RdxPhi];
setDebugLocFromInst(Builder, RdxDesc.StartValue);
// We need to generate a reduction vector from the incoming scalar.
// To do so, we need to generate the 'identity' vector and overide
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
// This is the vector-clone of the value that leaves the loop.
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
Type *VecTy = VectorExit[0]->getType();
// Find the reduction identity variable. Zero for addition, or, xor,
// one for multiplication, -1 for And.
Value *Identity;
Value *VectorStart;
if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
// MinMax reduction have the start value as their identify.
VectorStart = Identity = RdxDesc.StartValue;
} else {
Identity = LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
VecTy->getScalarType());
// This vector is the Identity vector where the first element is the
// incoming scalar reduction.
VectorStart = RdxDesc.StartValue;
}
// Fix the vector-loop phi.
// We created the induction variable so we know that the
// preheader is the first entry.
BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
BasicBlock *Latch = OrigLoop->getLoopLatch();
Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
VectorParts &Val = getVectorValue(LoopVal);
for (unsigned part = 0; part < UF; ++part) {
// Make sure to add the reduction stat value only to the
// first unroll part.
Value *StartVal = (part == 0) ? VectorStart : Identity;
cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
}
// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
// instructions.
Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
VectorParts RdxParts;
setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
for (unsigned part = 0; part < UF; ++part) {
// This PHINode contains the vectorized reduction variable, or
// the initial value vector, if we bypass the vector loop.
VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
Value *StartVal = (part == 0) ? VectorStart : Identity;
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
RdxParts.push_back(NewPhi);
}
// Reduce all of the unrolled parts into a single vector.
Value *ReducedPartRdx = RdxParts[0];
unsigned Op = getReductionBinOp(RdxDesc.Kind);
setDebugLocFromInst(Builder, ReducedPartRdx);
for (unsigned part = 1; part < UF; ++part) {
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
RdxParts[part], ReducedPartRdx,
"bin.rdx");
else
ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
ReducedPartRdx, RdxParts[part]);
}
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
// We know that the loop is in LCSSA form. We need to update the
// PHI nodes in the exit blocks.
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
if (!LCSSAPhi) continue;
// All PHINodes need to have a single entry edge, or two if
// we already fixed them.
assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
// We found our reduction value exit-PHI. Update it with the
// incoming bypass edge.
if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
// Add an edge coming from the bypass.
LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
break;
}
}// end of the LCSSA phi scan.
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
int IncomingEdgeBlockIdx =
(RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
fixLCSSAPHIs();
}
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
// Find all of the vectorized parameters.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *SrcOp = Instr->getOperand(op);
// If we are accessing the old induction variable, use the new one.
if (SrcOp == OldInduction) {
Params.push_back(getVectorValue(SrcOp));
continue;
}
// Try using previously calculated values.
Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
// If the src is an instruction that appeared earlier in the basic block
// then it should already be vectorized.
if (SrcInst && OrigLoop->contains(SrcInst)) {
assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
// The parameter is a vector value from earlier.
Params.push_back(WidenMap.get(SrcInst));
} else {
// The parameter is a scalar from outside the loop. Maybe even a constant.
VectorParts Scalars;
Scalars.append(UF, SrcOp);
Params.push_back(Scalars);
}
}
assert(Params.size() == Instr->getNumOperands() &&
"Invalid number of operands");
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Value *UndefVec = IsVoidRetTy ? 0 :
UndefValue::get(Instr->getType());
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
// For each scalar that we create:
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
// Replace the operands of the cloned instrucions with extracted scalars.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *Op = Params[op][Part];
Cloned->setOperand(op, Op);
}
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
// If the original scalar returns a value we need to place it in a vector
// so that future users will be able to use it.
if (!IsVoidRetTy)
VecResults[Part] = Cloned;
}
}
void
InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr,
LoopVectorizationLegality*) {
return scalarizeInstruction(Instr);
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) {
return Vec;
}
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
return V;
}
Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
bool Negate) {
// When unrolling and the VF is 1, we only need to add a simple scalar.
Type *ITy = Val->getType();
assert(!ITy->isVectorTy() && "Val must be a scalar");
Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
return Builder.CreateAdd(Val, C, "induction");
}

View File

@ -0,0 +1,39 @@
; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@a = common global [2048 x i32] zeroinitializer, align 16
; This is the loop.
; for (i=0; i<n; i++){
; a[i] += i;
; }
;CHECK-LABEL: @inc(
;CHECK: load i32*
;CHECK: load i32*
;CHECK: add nsw i32
;CHECK: add nsw i32
;CHECK: store i32
;CHECK: store i32
;CHECK: ret void
define void @inc(i32 %n) nounwind uwtable noinline ssp {
%1 = icmp sgt i32 %n, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph: ; preds = %0, %.lr.ph
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
%2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
%3 = load i32* %2, align 4
%4 = trunc i64 %indvars.iv to i32
%5 = add nsw i32 %3, %4
store i32 %5, i32* %2, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge: ; preds = %.lr.ph, %0
ret void
}