diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 60957d2c314..ffd07b6afce 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -213,10 +213,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addExtensionsToPM(EP_ScalarOptimizerLate, MPM); - if (SLPVectorize) { - MPM.add(createSLPVectorizerPass()); - MPM.add(createEarlyCSEPass()); - } + if (SLPVectorize) + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. if (BBVectorize) { MPM.add(createBBVectorizePass()); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index ea33801fd21..6d4c36aacdc 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/Verifier.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -45,13 +46,13 @@ SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, namespace { /// The SLPVectorizer Pass. -struct SLPVectorizer : public BasicBlockPass { +struct SLPVectorizer : public FunctionPass { typedef std::map StoreListMap; /// Pass identification, replacement for typeid static char ID; - explicit SLPVectorizer() : BasicBlockPass(ID) { + explicit SLPVectorizer() : FunctionPass(ID) { initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); } @@ -59,183 +60,257 @@ struct SLPVectorizer : public BasicBlockPass { DataLayout *DL; TargetTransformInfo *TTI; AliasAnalysis *AA; + LoopInfo *LI; - /// \brief Collect memory references and sort them according to their base - /// object. We sort the stores to their base objects to reduce the cost of the - /// quadratic search on the stores. TODO: We can further reduce this cost - /// if we flush the chain creation every time we run into a memory barrier. - bool collectStores(BasicBlock *BB, BoUpSLP &R) { - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - StoreInst *SI = dyn_cast(it); - if (!SI) - continue; - - // Check that the pointer points to scalars. - if (SI->getValueOperand()->getType()->isAggregateType()) - return false; - - // Find the base of the GEP. - Value *Ptr = SI->getPointerOperand(); - if (GetElementPtrInst *GEP = dyn_cast(Ptr)) - Ptr = GEP->getPointerOperand(); - - // Save the store locations. - StoreRefs[Ptr].push_back(SI); - } - return true; - } - - bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { - if (!A || !B) return false; - BoUpSLP::ValueList VL; - VL.push_back(A); - VL.push_back(B); - int Cost = R.getTreeCost(VL); - int ExtrCost = R.getScalarizationCost(VL); - DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << - " Cost of extract:" << ExtrCost << ".\n"); - if ((Cost+ExtrCost) >= -SLPCostThreshold) return false; - DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); - R.vectorizeArith(VL); - return true; - } - - bool tryToVectorize(BinaryOperator *V, BoUpSLP &R) { - if (!V) return false; - // Try to vectorize V. - if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) - return true; - - BinaryOperator *A = dyn_cast(V->getOperand(0)); - BinaryOperator *B = dyn_cast(V->getOperand(1)); - // Try to skip B. - if (B && B->hasOneUse()) { - BinaryOperator *B0 = dyn_cast(B->getOperand(0)); - BinaryOperator *B1 = dyn_cast(B->getOperand(1)); - if (tryToVectorizePair(A, B0, R)) { - B->moveBefore(V); - return true; - } - if (tryToVectorizePair(A, B1, R)) { - B->moveBefore(V); - return true; - } - } - - // Try to slip A. - if (A && A->hasOneUse()) { - BinaryOperator *A0 = dyn_cast(A->getOperand(0)); - BinaryOperator *A1 = dyn_cast(A->getOperand(1)); - if (tryToVectorizePair(A0, B, R)) { - A->moveBefore(V); - return true; - } - if (tryToVectorizePair(A1, B, R)) { - A->moveBefore(V); - return true; - } - } - return 0; - } - - bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { - bool Changed = false; - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - if (isa(it)) continue; - - // Try to vectorize reductions that use PHINodes. - if (PHINode *P = dyn_cast(it)) { - // Check that the PHI is a reduction PHI. - if (P->getNumIncomingValues() != 2) return Changed; - Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) : - (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : - 0)); - // Check if this is a Binary Operator. - BinaryOperator *BI = dyn_cast_or_null(Rdx); - if (!BI) - continue; - - Value *Inst = BI->getOperand(0); - if (Inst == P) Inst = BI->getOperand(1); - Changed |= tryToVectorize(dyn_cast(Inst), R); - continue; - } - - // Try to vectorize trees that start at compare instructions. - if (CmpInst *CI = dyn_cast(it)) { - if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { - Changed |= true; - continue; - } - for (int i = 0; i < 2; ++i) - if (BinaryOperator *BI = dyn_cast(CI->getOperand(i))) - Changed |= tryToVectorize(BI, R); - continue; - } - } - - return Changed; - } - - bool vectorizeStoreChains(BoUpSLP &R) { - bool Changed = false; - // Attempt to sort and vectorize each of the store-groups. - for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); - it != e; ++it) { - if (it->second.size() < 2) - continue; - - DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " << - it->second.size() << ".\n"); - - Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); - } - return Changed; - } - - virtual bool runOnBasicBlock(BasicBlock &BB) { + virtual bool runOnFunction(Function &F) { SE = &getAnalysis(); DL = getAnalysisIfAvailable(); TTI = &getAnalysis(); AA = &getAnalysis(); + LI = &getAnalysis(); + StoreRefs.clear(); + bool Changed = false; // Must have DataLayout. We can't require it because some tests run w/o // triple. if (!DL) return false; - // Use the bollom up slp vectorizer to construct chains that start with - // he store instructions. - BoUpSLP R(&BB, SE, DL, TTI, AA); + for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { + BasicBlock *BB = it; + bool BBChanged = false; - // Vectorize trees that end at reductions. - bool Changed = vectorizeReductions(&BB, R); + // Use the bollom up slp vectorizer to construct chains that start with + // he store instructions. + BoUpSLP R(BB, SE, DL, TTI, AA); - // Vectorize trees that end at stores. - if (collectStores(&BB, R)) { - DEBUG(dbgs()<<"SLP: Found stores to vectorize.\n"); - Changed |= vectorizeStoreChains(R); + // Vectorize trees that end at reductions. + BBChanged |= vectorizeReductions(BB, R); + + // Vectorize trees that end at stores. + if (collectStores(BB, R)) { + DEBUG(dbgs()<<"SLP: Found stores to vectorize.\n"); + BBChanged |= vectorizeStoreChains(R); + } + + // Try to hoist some of the scalarization code to the preheader. + if (BBChanged) hoistGatherSequence(LI, BB, R); + + Changed |= BBChanged; } if (Changed) { - DEBUG(dbgs()<<"SLP: vectorized \""<getName()<<"\"\n"); - DEBUG(verifyFunction(*BB.getParent())); + DEBUG(dbgs()<<"SLP: vectorized \""<(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); } +private: + + /// \brief Collect memory references and sort them according to their base + /// object. We sort the stores to their base objects to reduce the cost of the + /// quadratic search on the stores. TODO: We can further reduce this cost + /// if we flush the chain creation every time we run into a memory barrier. + bool collectStores(BasicBlock *BB, BoUpSLP &R); + + /// \brief Try to vectorize a chain that starts at two arithmetic instrs. + bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); + + /// \brief Try to vectorize a chain that may start at the operands of \V; + bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); + + /// \brief Vectorize the stores that were collected in StoreRefs. + bool vectorizeStoreChains(BoUpSLP &R); + + /// \brief Try to hoist gather sequences outside of the loop in cases where + /// all of the sources are loop invariant. + void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R); + + /// \brief Scan the basic block and look for reductions that may start a + /// vectorization chain. + bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R); + private: StoreListMap StoreRefs; }; +bool SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { + StoreRefs.clear(); + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + StoreInst *SI = dyn_cast(it); + if (!SI) + continue; + + // Check that the pointer points to scalars. + if (SI->getValueOperand()->getType()->isAggregateType()) + return false; + + // Find the base of the GEP. + Value *Ptr = SI->getPointerOperand(); + if (GetElementPtrInst *GEP = dyn_cast(Ptr)) + Ptr = GEP->getPointerOperand(); + + // Save the store locations. + StoreRefs[Ptr].push_back(SI); + } + return true; +} + +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { + if (!A || !B) return false; + BoUpSLP::ValueList VL; + VL.push_back(A); + VL.push_back(B); + int Cost = R.getTreeCost(VL); + int ExtrCost = R.getScalarizationCost(VL); + DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << + " Cost of extract:" << ExtrCost << ".\n"); + if ((Cost+ExtrCost) >= -SLPCostThreshold) return false; + DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); + R.vectorizeArith(VL); + return true; +} + +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { + if (!V) return false; + // Try to vectorize V. + if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) + return true; + + BinaryOperator *A = dyn_cast(V->getOperand(0)); + BinaryOperator *B = dyn_cast(V->getOperand(1)); + // Try to skip B. + if (B && B->hasOneUse()) { + BinaryOperator *B0 = dyn_cast(B->getOperand(0)); + BinaryOperator *B1 = dyn_cast(B->getOperand(1)); + if (tryToVectorizePair(A, B0, R)) { + B->moveBefore(V); + return true; + } + if (tryToVectorizePair(A, B1, R)) { + B->moveBefore(V); + return true; + } + } + + // Try to slip A. + if (A && A->hasOneUse()) { + BinaryOperator *A0 = dyn_cast(A->getOperand(0)); + BinaryOperator *A1 = dyn_cast(A->getOperand(1)); + if (tryToVectorizePair(A0, B, R)) { + A->moveBefore(V); + return true; + } + if (tryToVectorizePair(A1, B, R)) { + A->moveBefore(V); + return true; + } + } + return 0; +} + +bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { + bool Changed = false; + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + if (isa(it)) continue; + + // Try to vectorize reductions that use PHINodes. + if (PHINode *P = dyn_cast(it)) { + // Check that the PHI is a reduction PHI. + if (P->getNumIncomingValues() != 2) return Changed; + Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) : + (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : + 0)); + // Check if this is a Binary Operator. + BinaryOperator *BI = dyn_cast_or_null(Rdx); + if (!BI) + continue; + + Value *Inst = BI->getOperand(0); + if (Inst == P) Inst = BI->getOperand(1); + Changed |= tryToVectorize(dyn_cast(Inst), R); + continue; + } + + // Try to vectorize trees that start at compare instructions. + if (CmpInst *CI = dyn_cast(it)) { + if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { + Changed |= true; + continue; + } + for (int i = 0; i < 2; ++i) + if (BinaryOperator *BI = dyn_cast(CI->getOperand(i))) + Changed |= tryToVectorize(BI, R); + continue; + } + } + + return Changed; +} + +bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { + bool Changed = false; + // Attempt to sort and vectorize each of the store-groups. + for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); + it != e; ++it) { + if (it->second.size() < 2) + continue; + + DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " << + it->second.size() << ".\n"); + + Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); + } + return Changed; +} + +void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, + BoUpSLP &R) { + // Check if this block is inside a loop. + Loop *L = LI->getLoopFor(BB); + if (!L) + return; + + // Check if it has a preheader. + BasicBlock *PreHeader = L->getLoopPreheader(); + if (!PreHeader) + return; + + // Mark the insertion point for the block. + Instruction *Location = PreHeader->getTerminator(); + + BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions(); + for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end(); + it != e; ++it) { + InsertElementInst *Insert = dyn_cast(*it); + + // The InsertElement sequence can be simplified into a constant. + if (!Insert) + continue; + + // If the vector or the element that we insert into it are + // instructions that are defined in this basic block then we can't + // hoist this instruction. + Instruction *CurrVec = dyn_cast(Insert->getOperand(0)); + Instruction *NewElem = dyn_cast(Insert->getOperand(1)); + if (CurrVec && L->contains(CurrVec)) continue; + if (NewElem && L->contains(NewElem)) continue; + + // We can hoist this instruction. Move it to the pre-header. + Insert->moveBefore(Location); + } +} + } // end anonymous namespace char SLPVectorizer::ID = 0; diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index c85711532dd..a69646336c8 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -511,8 +511,15 @@ Instruction *BoUpSLP::GetLastInstr(ValueList &VL, unsigned VF) { Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) { IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements())); Value *Vec = UndefValue::get(Ty); - for (unsigned i=0; i < Ty->getNumElements(); ++i) + for (unsigned i=0; i < Ty->getNumElements(); ++i) { + // Generate the 'InsertElement' instruction. Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + // Remember that this instruction is used as part of a 'gather' sequence. + // The caller of the bottom-up slp vectorizer can try to hoist the sequence + // if the users are outside of the basic block. + GatherInstructions.push_back(Vec); + } + return Vec; } diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index 03512bf8c34..fed5178b802 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -71,6 +71,11 @@ struct BoUpSLP { /// \brief Vectorize a group of scalars into a vector tree. void vectorizeArith(ValueList &Operands); + /// \returns the list of new instructions that were added in order to collect + /// scalars into vectors. This list can be used to further optimize the gather + /// sequences. + ValueList &getGatherSeqInstructions() {return GatherInstructions; } + private: /// \brief This method contains the recursive part of getTreeCost. int getTreeCost_rec(ValueList &VL, unsigned Depth); @@ -107,11 +112,11 @@ private: /// \returns a vector from a collection of scalars in \p VL. Value *Scalarize(ValueList &VL, VectorType *Ty); - + private: - // Maps instructions to numbers and back. + /// Maps instructions to numbers and back. SmallDenseMap InstrIdx; - // Maps integers to Instructions. + /// Maps integers to Instructions. std::vector InstrVec; // -- containers that are used during getTreeCost -- // @@ -121,21 +126,29 @@ private: /// NOTICE: The vectorization methods also use this set. ValueSet MustScalarize; - // Contains a list of values that are used outside the current tree. This - // set must be reset between runs. + /// Contains a list of values that are used outside the current tree. This + /// set must be reset between runs. ValueSet MultiUserVals; - // Maps values in the tree to the vector lanes that uses them. This map must - // be reset between runs of getCost. + /// Maps values in the tree to the vector lanes that uses them. This map must + /// be reset between runs of getCost. std::map LaneMap; - // A list of instructions to ignore while sinking - // memory instructions. This map must be reset between runs of getCost. + /// A list of instructions to ignore while sinking + /// memory instructions. This map must be reset between runs of getCost. SmallPtrSet MemBarrierIgnoreList; - // -- containers that are used during vectorizeTree -- // - // Maps between the first scalar to the vector. This map must be reset between - // runs. + // -- Containers that are used during vectorizeTree -- // + + /// Maps between the first scalar to the vector. This map must be reset + ///between runs. DenseMap VectorizedValues; + // -- Containers that are used after vectorization by the caller -- // + + /// A list of instructions that are used when gathering scalars into vectors. + /// In many cases these instructions can be hoisted outside of the BB. + /// Iterating over this list is faster than calling LICM. + ValueList GatherInstructions; + // Analysis and block reference. BasicBlock *BB; ScalarEvolution *SE; diff --git a/test/Transforms/SLPVectorizer/X86/hoist.ll b/test/Transforms/SLPVectorizer/X86/hoist.ll new file mode 100644 index 00000000000..5074ceaaabd --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/hoist.ll @@ -0,0 +1,59 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.9.0" + +;int foo(int *A, int n, int k) { +; for (int i=0; i < 10000; i+=4) { +; A[i] += n; +; A[i+1] += k; +; A[i+2] += n; +; A[i+3] += k; +; } +;} + +; preheader: +;CHECK: entry +;CHECK-NEXT: insertelement +;CHECK-NEXT: insertelement +;CHECK-NEXT: insertelement +;CHECK-NEXT: insertelement +; loop body: +;CHECK: phi +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret +define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.024 = phi i32 [ 0, %entry ], [ %add10, %for.body ] + %arrayidx = getelementptr inbounds i32* %A, i32 %i.024 + %0 = load i32* %arrayidx, align 4 + %add = add nsw i32 %0, %n + store i32 %add, i32* %arrayidx, align 4 + %add121 = or i32 %i.024, 1 + %arrayidx2 = getelementptr inbounds i32* %A, i32 %add121 + %1 = load i32* %arrayidx2, align 4 + %add3 = add nsw i32 %1, %k + store i32 %add3, i32* %arrayidx2, align 4 + %add422 = or i32 %i.024, 2 + %arrayidx5 = getelementptr inbounds i32* %A, i32 %add422 + %2 = load i32* %arrayidx5, align 4 + %add6 = add nsw i32 %2, %n + store i32 %add6, i32* %arrayidx5, align 4 + %add723 = or i32 %i.024, 3 + %arrayidx8 = getelementptr inbounds i32* %A, i32 %add723 + %3 = load i32* %arrayidx8, align 4 + %add9 = add nsw i32 %3, %k + store i32 %add9, i32* %arrayidx8, align 4 + %add10 = add nsw i32 %i.024, 4 + %cmp = icmp slt i32 %add10, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret i32 undef +} +