From 9d77533d54d24cc091cb08fc3af0e3e3c9c7e475 Mon Sep 17 00:00:00 2001
From: Haicheng Wu
Date: Sat, 23 Jan 2016 06:52:41 +0000
Subject: [PATCH] [LIR] Add support for structs and hand unrolled loops

Now LIR can turn the following code into memset:

typedef struct foo {
  int a;
  int b;
} foo_t;

void bar(foo_t *f, unsigned n) {
  for (unsigned i = 0; i < n; ++i) {
    f[i].a = 0;
    f[i].b = 0;
  }
}

void test(int *f, unsigned n) {
  for (unsigned i = 0; i < 2 * n; i += 2) {
    f[i] = 0;
    f[i+1] = 0;
  }
}

llvm-svn: 258620
---
 include/llvm/Analysis/LoopAccessAnalysis.h   |   5 +
 lib/Analysis/LoopAccessAnalysis.cpp          |  72 ++++++
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 246 +++++++++++++++----
 lib/Transforms/Vectorize/SLPVectorizer.cpp   |  89 +------
 test/Transforms/LoopIdiom/struct.ll          | 221 +++++++++++++++++
 test/Transforms/LoopIdiom/struct_pattern.ll  | 186 ++++++++++++++
 test/Transforms/LoopIdiom/unroll.ll          |  80 ++++++
 7 files changed, 775 insertions(+), 124 deletions(-)
 create mode 100644 test/Transforms/LoopIdiom/struct.ll
 create mode 100644 test/Transforms/LoopIdiom/struct_pattern.ll
 create mode 100644 test/Transforms/LoopIdiom/unroll.ll

diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 871d35e99b7..7edd1cf73dc 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -659,6 +659,11 @@ const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
 int isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
                  const ValueToValueMap &StridesMap);
 
+/// \brief Returns true if the memory operations \p A and \p B are consecutive.
+/// This is a simple API that does not depend on the analysis pass.
+bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
+                         ScalarEvolution &SE, bool CheckType = true);
+
 /// \brief This analysis provides dependence information for the memory accesses
 /// of a loop.
 ///
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 84f9fa69463..a2ab231a62d 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -901,6 +901,78 @@ int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr,
   return Stride;
 }
 
+/// Take the pointer operand from the Load/Store instruction.
+/// Returns NULL if this is not a valid Load/Store instruction.
+static Value *getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  return nullptr;
+}
+
+/// Take the address space operand from the Load/Store instruction.
+/// Returns -1 if this is not a valid Load/Store instruction.
+static unsigned getAddressSpaceOperand(Value *I) {
+  if (LoadInst *L = dyn_cast<LoadInst>(I))
+    return L->getPointerAddressSpace();
+  if (StoreInst *S = dyn_cast<StoreInst>(I))
+    return S->getPointerAddressSpace();
+  return -1;
+}
+
+/// Returns true if the memory operations \p A and \p B are consecutive.
+bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
+                               ScalarEvolution &SE, bool CheckType) {
+  Value *PtrA = getPointerOperand(A);
+  Value *PtrB = getPointerOperand(B);
+  unsigned ASA = getAddressSpaceOperand(A);
+  unsigned ASB = getAddressSpaceOperand(B);
+
+  // Check that the address spaces match and that the pointers are valid.
+  if (!PtrA || !PtrB || (ASA != ASB))
+    return false;
+
+  // Make sure that A and B are different pointers.
+  if (PtrA == PtrB)
+    return false;
+
+  // Make sure that A and B have the same type if required.
+  if (CheckType && PtrA->getType() != PtrB->getType())
+    return false;
+
+  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
+
+  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+  // OffsetDelta = OffsetB - OffsetA;
+  const SCEV *OffsetSCEVA = SE.getConstant(OffsetA);
+  const SCEV *OffsetSCEVB = SE.getConstant(OffsetB);
+  const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
+  const SCEVConstant *OffsetDeltaC = dyn_cast<SCEVConstant>(OffsetDeltaSCEV);
+  const APInt &OffsetDelta = OffsetDeltaC->getAPInt();
+  // Check if they are based on the same pointer. That makes the offsets
+  // sufficient.
+  if (PtrA == PtrB)
+    return OffsetDelta == Size;
+
+  // Compute the necessary base pointer delta to have the necessary final delta
+  // equal to the size.
+  // BaseDelta = Size - OffsetDelta;
+  const SCEV *SizeSCEV = SE.getConstant(Size);
+  const SCEV *BaseDelta = SE.getMinusSCEV(SizeSCEV, OffsetDeltaSCEV);
+
+  // Otherwise compute the distance with SCEV between the base pointers.
+  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+  const SCEV *X = SE.getAddExpr(PtrSCEVA, BaseDelta);
+  return X == PtrSCEVB;
+}
+
 bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
   switch (Type) {
   case NoDep:
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 4521640e394..bb113ccf16e 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -26,22 +26,20 @@
 // i64 and larger types when i64 is legal and the value has few bits set. It
 // would be good to enhance isel to emit a loop for ctpop in this case.
 //
-// We should enhance the memset/memcpy recognition to handle multiple stores in
-// the loop. This would handle things like:
-//   void foo(_Complex float *P)
-//     for (i) { __real__(*P) = 0;  __imag__(*P) = 0; }
-//
 // This could recognize common matrix multiplies and dot product idioms and
 // replace them with calls to BLAS (if linked in??).
// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -108,7 +106,9 @@ public: private: typedef SmallVector StoreList; - StoreList StoreRefsForMemset; + typedef MapVector StoreListMap; + StoreListMap StoreRefsForMemset; + StoreListMap StoreRefsForMemsetPattern; StoreList StoreRefsForMemcpy; bool HasMemset; bool HasMemsetPattern; @@ -122,14 +122,18 @@ private: SmallVectorImpl &ExitBlocks); void collectStores(BasicBlock *BB); - bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy); - bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern, + bool &ForMemcpy); + bool processLoopStores(SmallVectorImpl &SL, const SCEV *BECount, + bool ForMemset); bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, Value *StoredVal, - Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount, bool NegStride); + Instruction *TheStore, + SmallPtrSetImpl &Stores, + const SCEVAddRecExpr *Ev, const SCEV *BECount, + bool NegStride); bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); /// @} @@ -305,7 +309,7 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { } bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, - bool &ForMemcpy) { + bool &ForMemsetPattern, bool &ForMemcpy) { // Don't touch volatile stores. if (!SI->isSimple()) return false; @@ -353,7 +357,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, StorePtr->getType()->getPointerAddressSpace() == 0 && (PatternValue = getMemSetPatternValue(StoredVal, DL))) { // It looks like we can use PatternValue! - ForMemset = true; + ForMemsetPattern = true; return true; } @@ -393,6 +397,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, void LoopIdiomRecognize::collectStores(BasicBlock *BB) { StoreRefsForMemset.clear(); + StoreRefsForMemsetPattern.clear(); StoreRefsForMemcpy.clear(); for (Instruction &I : *BB) { StoreInst *SI = dyn_cast(&I); @@ -400,15 +405,22 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { continue; bool ForMemset = false; + bool ForMemsetPattern = false; bool ForMemcpy = false; // Make sure this is a strided store with a constant stride. - if (!isLegalStore(SI, ForMemset, ForMemcpy)) + if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy)) continue; // Save the store locations. - if (ForMemset) - StoreRefsForMemset.push_back(SI); - else if (ForMemcpy) + if (ForMemset) { + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + StoreRefsForMemset[Ptr].push_back(SI); + } else if (ForMemsetPattern) { + // Find the base pointer. 
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + StoreRefsForMemsetPattern[Ptr].push_back(SI); + } else if (ForMemcpy) StoreRefsForMemcpy.push_back(SI); } } @@ -430,9 +442,14 @@ bool LoopIdiomRecognize::runOnLoopBlock( // Look for store instructions, which may be optimized to memset/memcpy. collectStores(BB); - // Look for a single store which can be optimized into a memset. - for (auto &SI : StoreRefsForMemset) - MadeChange |= processLoopStore(SI, BECount); + // Look for a single store or sets of stores with a common base, which can be + // optimized into a memset (memset_pattern). The latter most commonly happens + // with structs and handunrolled loops. + for (auto &SL : StoreRefsForMemset) + MadeChange |= processLoopStores(SL.second, BECount, true); + + for (auto &SL : StoreRefsForMemsetPattern) + MadeChange |= processLoopStores(SL.second, BECount, false); // Optimize the store into a memcpy, if it feeds an similarly strided load. for (auto &SI : StoreRefsForMemcpy) @@ -458,26 +475,155 @@ bool LoopIdiomRecognize::runOnLoopBlock( return MadeChange; } -/// processLoopStore - See if this store can be promoted to a memset. -bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - assert(SI->isSimple() && "Expected only non-volatile stores."); +/// processLoopStores - See if this store(s) can be promoted to a memset. +bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl &SL, + const SCEV *BECount, + bool ForMemset) { + // Try to find consecutive stores that can be transformed into memsets. + SetVector Heads, Tails; + SmallDenseMap ConsecutiveChain; - Value *StoredVal = SI->getValueOperand(); - Value *StorePtr = SI->getPointerOperand(); + // Do a quadratic search on all of the given stores and find + // all of the pairs of stores that follow each other. + SmallVector IndexQueue; + for (unsigned i = 0, e = SL.size(); i < e; ++i) { + assert(SL[i]->isSimple() && "Expected only non-volatile stores."); - // Check to see if the stride matches the size of the store. If so, then we - // know that every byte is touched in the loop. - const SCEVAddRecExpr *StoreEv = cast(SE->getSCEV(StorePtr)); - unsigned Stride = getStoreStride(StoreEv); - unsigned StoreSize = getStoreSizeInBytes(SI, DL); - if (StoreSize != Stride && StoreSize != -Stride) - return false; + Value *FirstStoredVal = SL[i]->getValueOperand(); + Value *FirstStorePtr = SL[i]->getPointerOperand(); + const SCEVAddRecExpr *FirstStoreEv = + cast(SE->getSCEV(FirstStorePtr)); + unsigned FirstStride = getStoreStride(FirstStoreEv); + unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL); - bool NegStride = StoreSize == -Stride; + // See if we can optimize just this store in isolation. + if (FirstStride == FirstStoreSize || FirstStride == -FirstStoreSize) { + Heads.insert(SL[i]); + continue; + } - // See if we can optimize just this store in isolation. - return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), - StoredVal, SI, StoreEv, BECount, NegStride); + Value *FirstSplatValue = nullptr; + Constant *FirstPatternValue = nullptr; + + if (ForMemset) + FirstSplatValue = isBytewiseValue(FirstStoredVal); + else + FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL); + + assert((FirstSplatValue || FirstPatternValue) && + "Expected either splat value or pattern value."); + + IndexQueue.clear(); + // If a store has multiple consecutive store candidates, search Stores + // array according to the sequence: from i+1 to e, then from i-1 to 0. 
+ // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find memset opportunity. + unsigned j = 0; + for (j = i + 1; j < e; ++j) + IndexQueue.push_back(j); + for (j = i; j > 0; --j) + IndexQueue.push_back(j - 1); + + for (auto &k : IndexQueue) { + assert(SL[k]->isSimple() && "Expected only non-volatile stores."); + Value *SecondStorePtr = SL[k]->getPointerOperand(); + const SCEVAddRecExpr *SecondStoreEv = + cast(SE->getSCEV(SecondStorePtr)); + unsigned SecondStride = getStoreStride(SecondStoreEv); + + if (FirstStride != SecondStride) + continue; + + Value *SecondStoredVal = SL[k]->getValueOperand(); + Value *SecondSplatValue = nullptr; + Constant *SecondPatternValue = nullptr; + + if (ForMemset) + SecondSplatValue = isBytewiseValue(SecondStoredVal); + else + SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL); + + assert((SecondSplatValue || SecondPatternValue) && + "Expected either splat value or pattern value."); + + if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) { + if (ForMemset) { + ConstantInt *C1 = dyn_cast(FirstSplatValue); + ConstantInt *C2 = dyn_cast(SecondSplatValue); + if (!C1 || !C2 || C1 != C2) + continue; + } else { + Constant *C1 = FirstPatternValue; + Constant *C2 = SecondPatternValue; + + if (ConstantArray *CA1 = dyn_cast(C1)) + C1 = CA1->getSplatValue(); + + if (ConstantArray *CA2 = dyn_cast(C2)) + C2 = CA2->getSplatValue(); + + if (C1 != C2) + continue; + } + Tails.insert(SL[k]); + Heads.insert(SL[i]); + ConsecutiveChain[SL[i]] = SL[k]; + break; + } + } + } + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we transformed so that we don't visit the same store twice. + SmallPtrSet TransformedStores; + bool Changed = false; + + // For stores that start but don't end a link in the chain: + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to transform it. + SmallPtrSet AdjacentStores; + StoreInst *I = *it; + + StoreInst *HeadStore = I; + unsigned StoreSize = 0; + + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (TransformedStores.count(I)) + break; + AdjacentStores.insert(I); + + StoreSize += getStoreSizeInBytes(I, DL); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + Value *StoredVal = HeadStore->getValueOperand(); + Value *StorePtr = HeadStore->getPointerOperand(); + const SCEVAddRecExpr *StoreEv = cast(SE->getSCEV(StorePtr)); + unsigned Stride = getStoreStride(StoreEv); + + // Check to see if the stride matches the size of the stores. If so, then + // we know that every byte is touched in the loop. + if (StoreSize != Stride && StoreSize != -Stride) + continue; + + bool NegStride = StoreSize == -Stride; + + if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), + StoredVal, HeadStore, AdjacentStores, StoreEv, + BECount, NegStride)) { + TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); + Changed = true; + } + } + + return Changed; } /// processLoopMemSet - See if this memset can be promoted to a large memset. 
@@ -520,18 +666,21 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue)) return false; + SmallPtrSet MSIs; + MSIs.insert(MSI); return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), SplatValue, MSI, Ev, + MSI->getAlignment(), SplatValue, MSI, MSIs, Ev, BECount, /*NegStride=*/false); } /// mayLoopAccessLocation - Return true if the specified loop might access the /// specified pointer location, which is a loop-strided access. The 'Access' /// argument specifies what the verboten forms of access are (read or write). -static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, - const SCEV *BECount, unsigned StoreSize, - AliasAnalysis &AA, - Instruction *IgnoredStore) { +static bool +mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, + const SCEV *BECount, unsigned StoreSize, + AliasAnalysis &AA, + SmallPtrSetImpl &IgnoredStores) { // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts // at the pointer and has infinite size. @@ -551,7 +700,8 @@ static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; ++BI) for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) - if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access)) + if (IgnoredStores.count(&*I) == 0 && + (AA.getModRefInfo(&*I, StoreLoc) & Access)) return true; return false; @@ -574,7 +724,8 @@ static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, /// transform this into a memset or memset_pattern in the loop preheader, do so. bool LoopIdiomRecognize::processLoopStridedStore( Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, - Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, + Value *StoredVal, Instruction *TheStore, + SmallPtrSetImpl &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool NegStride) { Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; @@ -609,7 +760,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *BasePtr = Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, - *AA, TheStore)) { + *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); @@ -662,7 +813,8 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(TheStore, TLI); + for (auto *I : Stores) + deleteDeadInstruction(I, TLI); ++NumMemSet; return true; } @@ -714,8 +866,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *StoreBasePtr = Expander.expandCodeFor( StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + SmallPtrSet Stores; + Stores.insert(SI); if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, - StoreSize, *AA, SI)) { + StoreSize, *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. 
RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); @@ -735,7 +889,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, - *AA, SI)) { + *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2520c78b538..8989b13cccc 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -401,9 +402,6 @@ public: } } - /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL); - /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -438,14 +436,6 @@ private: /// vectorized, or NULL. They may happen in cycles. Value *alreadyVectorized(ArrayRef VL) const; - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); - - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); - /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(Type *Ty); @@ -1191,8 +1181,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { return; } - if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { - if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, *SE)) { + if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL, *SE)) { ++NumLoadsWantToChangeOrder; } BS.cancelScheduling(VL); @@ -1364,7 +1354,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { const DataLayout &DL = F->getParent()->getDataLayout(); // Check if the stores are consecutive or of we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, *SE)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); @@ -1837,63 +1827,6 @@ int BoUpSLP::getGatherCost(ArrayRef VL) { return getGatherCost(VecTy); } -Value *BoUpSLP::getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast(I)) - return SI->getPointerOperand(); - return nullptr; -} - -unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { - if (LoadInst *L = dyn_cast(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast(I)) - return S->getPointerAddressSpace(); - return -1; -} - -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); - unsigned ASA = getAddressSpaceOperand(A); - unsigned ASB = getAddressSpaceOperand(B); - - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; - - // Make sure that A and B are different pointers of the same type. - if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) - return false; - - unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); - Type *Ty = cast(PtrA->getType())->getElementType(); - APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); - - APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); - - APInt OffsetDelta = OffsetB - OffsetA; - - // Check if they are based on the same pointer. That makes the offsets - // sufficient. - if (PtrA == PtrB) - return OffsetDelta == Size; - - // Compute the necessary base pointer delta to have the necessary final delta - // equal to the size. - APInt BaseDelta = Size - OffsetDelta; - - // Otherwise compute the distance with SCEV between the base pointers. - const SCEV *PtrSCEVA = SE->getSCEV(PtrA); - const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *C = SE->getConstant(BaseDelta); - const SCEV *X = SE->getAddExpr(PtrSCEVA, C); - return X == PtrSCEVB; -} - // Reorder commutative operations in alternate shuffle if the resulting vectors // are consecutive loads. This would allow us to vectorize the tree. 
// If we have something like- @@ -1921,10 +1854,10 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef VL, if (LoadInst *L1 = dyn_cast(Right[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); - if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + if (isConsecutiveAccess(L, L1, DL, *SE) && VL1->isCommutative()) { std::swap(Left[j], Right[j]); continue; - } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + } else if (isConsecutiveAccess(L, L1, DL, *SE) && VL2->isCommutative()) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -1935,10 +1868,10 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef VL, if (LoadInst *L1 = dyn_cast(Left[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); - if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + if (isConsecutiveAccess(L, L1, DL, *SE) && VL1->isCommutative()) { std::swap(Left[j], Right[j]); continue; - } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + } else if (isConsecutiveAccess(L, L1, DL, *SE) && VL2->isCommutative()) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -2088,7 +2021,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, for (unsigned j = 0; j < VL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - if (isConsecutiveAccess(L, L1, DL)) { + if (isConsecutiveAccess(L, L1, DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -2096,7 +2029,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - if (isConsecutiveAccess(L, L1, DL)) { + if (isConsecutiveAccess(L, L1, DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -3461,7 +3394,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef Stores, IndexQueue.push_back(j - 1); for (auto &k : IndexQueue) { - if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) { + if (isConsecutiveAccess(Stores[i], Stores[k], DL, *SE)) { Tails.insert(Stores[k]); Heads.insert(Stores[i]); ConsecutiveChain[Stores[i]] = Stores[k]; diff --git a/test/Transforms/LoopIdiom/struct.ll b/test/Transforms/LoopIdiom/struct.ll new file mode 100644 index 00000000000..2828024952e --- /dev/null +++ b/test/Transforms/LoopIdiom/struct.ll @@ -0,0 +1,221 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +target triple = "x86_64-apple-darwin10.0.0" + +%struct.foo = type { i32, i32 } +%struct.foo1 = type { i32, i32, i32 } +%struct.foo2 = type { i32, i16, i16 } + +;void bar1(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 0; +; } +;} +define void @bar1(%struct.foo* %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 0, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 
%lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar1( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} + +;void bar2(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].b = 0; +; f[i].a = 0; +; } +;} +define void @bar2(%struct.foo* %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 0, i32* %b, align 4 + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar2( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} + +;void bar3(foo_t *f, unsigned n) { +; for (unsigned i = n; i > 0; --i) { +; f[i].a = 0; +; f[i].b = 0; +; } +;} +define void @bar3(%struct.foo* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 0, i32* %b, align 4 + %1 = trunc i64 %indvars.iv to i32 + %dec = add i32 %1, -1 + %cmp = icmp eq i32 %dec, 0 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar3( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} + +;void bar4(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 1; +; } +;} +define void @bar4(%struct.foo* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 1, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: 
@bar4( +; CHECK-NOT: call void @llvm.memset +} + +;void bar5(foo1_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 0; +; } +;} +define void @bar5(%struct.foo1* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 1 + store i32 0, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar5( +; CHECK-NOT: call void @llvm.memset +} + +;void bar6(foo2_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 0; +; f[i].c = 0; +; } +;} +define void @bar6(%struct.foo2* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i64 %indvars.iv, i32 1 + store i16 0, i16* %b, align 4 + %c = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i64 %indvars.iv, i32 2 + store i16 0, i16* %c, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar6( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} diff --git a/test/Transforms/LoopIdiom/struct_pattern.ll b/test/Transforms/LoopIdiom/struct_pattern.ll new file mode 100644 index 00000000000..d7809b746b1 --- /dev/null +++ b/test/Transforms/LoopIdiom/struct_pattern.ll @@ -0,0 +1,186 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 +; CHECK: @.memset_pattern.1 = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 +; CHECK: @.memset_pattern.2 = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 + +target triple = "x86_64-apple-darwin10.0.0" + +%struct.foo = type { i32, i32 } +%struct.foo1 = type { i32, i32, i32 } + +;void bar1(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 2; +; f[i].b = 2; +; } +;} +define void @bar1(%struct.foo* %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label 
%for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 2, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 2, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar1( +; CHECK: call void @memset_pattern16 +; CHECK-NOT: store +} + +;void bar2(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].b = 2; +; f[i].a = 2; +; } +;} +define void @bar2(%struct.foo* %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 2, i32* %b, align 4 + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 2, i32* %a, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar2( +; CHECK: call void @memset_pattern16 +; CHECK-NOT: store +} + +;void bar3(foo_t *f, unsigned n) { +; for (unsigned i = n; i > 0; --i) { +; f[i].a = 2; +; f[i].b = 2; +; } +;} +define void @bar3(%struct.foo* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 2, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 2, i32* %b, align 4 + %1 = trunc i64 %indvars.iv to i32 + %dec = add i32 %1, -1 + %cmp = icmp eq i32 %dec, 0 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar3( +; CHECK: call void @memset_pattern16 +; CHECK-NOT: store +} + +;void bar4(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 1; +; } +;} +define void @bar4(%struct.foo* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ 
%indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 1, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar4( +; CHECK-NOT: call void @memset_pattern16 +} + +;void bar5(foo1_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 1; +; f[i].b = 1; +; } +;} +define void @bar5(%struct.foo1* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 0 + store i32 1, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 1 + store i32 1, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar5( +; CHECK-NOT: call void @memset_pattern16 +} diff --git a/test/Transforms/LoopIdiom/unroll.ll b/test/Transforms/LoopIdiom/unroll.ll new file mode 100644 index 00000000000..0cdfda254d7 --- /dev/null +++ b/test/Transforms/LoopIdiom/unroll.ll @@ -0,0 +1,80 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +; CHECK @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 + +target triple = "x86_64-apple-darwin10.0.0" + +;void test(int *f, unsigned n) { +; for (unsigned i = 0; i < 2 * n; i += 2) { +; f[i] = 0; +; f[i+1] = 0; +; } +;} +define void @test(i32* %f, i32 %n) nounwind ssp { +entry: + %mul = shl i32 %n, 1 + %cmp1 = icmp eq i32 %mul, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = zext i32 %mul to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv + store i32 0, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %f, i64 %1 + store i32 0, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, %0 + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @test( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} + +;void test_pattern(int *f, unsigned n) { +; for (unsigned i = 0; i < 2 * n; i += 2) { +; f[i] = 2; +; 
f[i+1] = 2; +; } +;} +define void @test_pattern(i32* %f, i32 %n) nounwind ssp { +entry: + %mul = shl i32 %n, 1 + %cmp1 = icmp eq i32 %mul, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = zext i32 %mul to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv + store i32 2, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %f, i64 %1 + store i32 2, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, %0 + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @test_pattern( +; CHECK: call void @memset_pattern16 +; CHECK-NOT: store +}