From 9d77533d54d24cc091cb08fc3af0e3e3c9c7e475 Mon Sep 17 00:00:00 2001
From: Haicheng Wu
Date: Sat, 23 Jan 2016 06:52:41 +0000
Subject: [PATCH] [LIR] Add support for structs and hand unrolled loops

Now LIR can turn the following code into memset:

typedef struct foo {
  int a;
  int b;
} foo_t;

void bar(foo_t *f, unsigned n) {
  for (unsigned i = 0; i < n; ++i) {
    f[i].a = 0;
    f[i].b = 0;
  }
}

void test(int *f, unsigned n) {
  for (unsigned i = 0; i < 2 * n; i += 2) {
    f[i] = 0;
    f[i+1] = 0;
  }
}

llvm-svn: 258620
---
 include/llvm/Analysis/LoopAccessAnalysis.h   |   5 +
 lib/Analysis/LoopAccessAnalysis.cpp          |  72 ++++++
 lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 246 +++++++++++++++----
 lib/Transforms/Vectorize/SLPVectorizer.cpp   |  89 +------
 test/Transforms/LoopIdiom/struct.ll          | 221 +++++++++++++++++
 test/Transforms/LoopIdiom/struct_pattern.ll  | 186 ++++++++++++++
 test/Transforms/LoopIdiom/unroll.ll          |  80 ++++++
 7 files changed, 775 insertions(+), 124 deletions(-)
 create mode 100644 test/Transforms/LoopIdiom/struct.ll
 create mode 100644 test/Transforms/LoopIdiom/struct_pattern.ll
 create mode 100644 test/Transforms/LoopIdiom/unroll.ll

diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 871d35e99b7..7edd1cf73dc 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -659,6 +659,11 @@ const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
 int isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
                  const ValueToValueMap &StridesMap);
 
+/// \brief Returns true if the memory operations \p A and \p B are consecutive.
+/// This is a simple API that does not depend on the analysis pass.
+bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
+                         ScalarEvolution &SE, bool CheckType = true);
+
 /// \brief This analysis provides dependence information for the memory accesses
 /// of a loop.
 ///
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 84f9fa69463..a2ab231a62d 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -901,6 +901,78 @@ int llvm::isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr,
   return Stride;
 }
 
+/// Take the pointer operand from the Load/Store instruction.
+/// Returns NULL if this is not a valid Load/Store instruction.
+static Value *getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  return nullptr;
+}
+
+/// Take the address space operand from the Load/Store instruction.
+/// Returns -1 if this is not a valid Load/Store instruction.
+static unsigned getAddressSpaceOperand(Value *I) {
+  if (LoadInst *L = dyn_cast<LoadInst>(I))
+    return L->getPointerAddressSpace();
+  if (StoreInst *S = dyn_cast<StoreInst>(I))
+    return S->getPointerAddressSpace();
+  return -1;
+}
+
+/// Returns true if the memory operations \p A and \p B are consecutive.
+bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
+                               ScalarEvolution &SE, bool CheckType) {
+  Value *PtrA = getPointerOperand(A);
+  Value *PtrB = getPointerOperand(B);
+  unsigned ASA = getAddressSpaceOperand(A);
+  unsigned ASB = getAddressSpaceOperand(B);
+
+  // Check that the address spaces match and that the pointers are valid.
+  if (!PtrA || !PtrB || (ASA != ASB))
+    return false;
+
+  // Make sure that A and B are different pointers.
+  if (PtrA == PtrB)
+    return false;
+
+  // Make sure that A and B have the same type if required.
+  if (CheckType && PtrA->getType() != PtrB->getType())
+    return false;
+
+  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
+
+  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+  // OffsetDelta = OffsetB - OffsetA;
+  const SCEV *OffsetSCEVA = SE.getConstant(OffsetA);
+  const SCEV *OffsetSCEVB = SE.getConstant(OffsetB);
+  const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
+  const SCEVConstant *OffsetDeltaC = dyn_cast<SCEVConstant>(OffsetDeltaSCEV);
+  const APInt &OffsetDelta = OffsetDeltaC->getAPInt();
+  // Check if they are based on the same pointer. That makes the offsets
+  // sufficient.
+  if (PtrA == PtrB)
+    return OffsetDelta == Size;
+
+  // Compute the necessary base pointer delta to have the necessary final delta
+  // equal to the size.
+  // BaseDelta = Size - OffsetDelta;
+  const SCEV *SizeSCEV = SE.getConstant(Size);
+  const SCEV *BaseDelta = SE.getMinusSCEV(SizeSCEV, OffsetDeltaSCEV);
+
+  // Otherwise compute the distance with SCEV between the base pointers.
+  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+  const SCEV *X = SE.getAddExpr(PtrSCEVA, BaseDelta);
+  return X == PtrSCEVB;
+}
+
 bool MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
   switch (Type) {
   case NoDep:
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 4521640e394..bb113ccf16e 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -26,22 +26,20 @@
 // i64 and larger types when i64 is legal and the value has few bits set. It
 // would be good to enhance isel to emit a loop for ctpop in this case.
 //
-// We should enhance the memset/memcpy recognition to handle multiple stores in
-// the loop. This would handle things like:
-//   void foo(_Complex float *P)
-//     for (i) { __real__(*P) = 0;  __imag__(*P) = 0; }
-//
 // This could recognize common matrix multiplies and dot product idioms and
 // replace them with calls to BLAS (if linked in??).
// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -108,7 +106,9 @@ public: private: typedef SmallVector StoreList; - StoreList StoreRefsForMemset; + typedef MapVector StoreListMap; + StoreListMap StoreRefsForMemset; + StoreListMap StoreRefsForMemsetPattern; StoreList StoreRefsForMemcpy; bool HasMemset; bool HasMemsetPattern; @@ -122,14 +122,18 @@ private: SmallVectorImpl &ExitBlocks); void collectStores(BasicBlock *BB); - bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemcpy); - bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern, + bool &ForMemcpy); + bool processLoopStores(SmallVectorImpl &SL, const SCEV *BECount, + bool ForMemset); bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, Value *StoredVal, - Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount, bool NegStride); + Instruction *TheStore, + SmallPtrSetImpl &Stores, + const SCEVAddRecExpr *Ev, const SCEV *BECount, + bool NegStride); bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount); /// @} @@ -305,7 +309,7 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { } bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, - bool &ForMemcpy) { + bool &ForMemsetPattern, bool &ForMemcpy) { // Don't touch volatile stores. if (!SI->isSimple()) return false; @@ -353,7 +357,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, StorePtr->getType()->getPointerAddressSpace() == 0 && (PatternValue = getMemSetPatternValue(StoredVal, DL))) { // It looks like we can use PatternValue! - ForMemset = true; + ForMemsetPattern = true; return true; } @@ -393,6 +397,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, void LoopIdiomRecognize::collectStores(BasicBlock *BB) { StoreRefsForMemset.clear(); + StoreRefsForMemsetPattern.clear(); StoreRefsForMemcpy.clear(); for (Instruction &I : *BB) { StoreInst *SI = dyn_cast(&I); @@ -400,15 +405,22 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { continue; bool ForMemset = false; + bool ForMemsetPattern = false; bool ForMemcpy = false; // Make sure this is a strided store with a constant stride. - if (!isLegalStore(SI, ForMemset, ForMemcpy)) + if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy)) continue; // Save the store locations. - if (ForMemset) - StoreRefsForMemset.push_back(SI); - else if (ForMemcpy) + if (ForMemset) { + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + StoreRefsForMemset[Ptr].push_back(SI); + } else if (ForMemsetPattern) { + // Find the base pointer. 
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); + StoreRefsForMemsetPattern[Ptr].push_back(SI); + } else if (ForMemcpy) StoreRefsForMemcpy.push_back(SI); } } @@ -430,9 +442,14 @@ bool LoopIdiomRecognize::runOnLoopBlock( // Look for store instructions, which may be optimized to memset/memcpy. collectStores(BB); - // Look for a single store which can be optimized into a memset. - for (auto &SI : StoreRefsForMemset) - MadeChange |= processLoopStore(SI, BECount); + // Look for a single store or sets of stores with a common base, which can be + // optimized into a memset (memset_pattern). The latter most commonly happens + // with structs and handunrolled loops. + for (auto &SL : StoreRefsForMemset) + MadeChange |= processLoopStores(SL.second, BECount, true); + + for (auto &SL : StoreRefsForMemsetPattern) + MadeChange |= processLoopStores(SL.second, BECount, false); // Optimize the store into a memcpy, if it feeds an similarly strided load. for (auto &SI : StoreRefsForMemcpy) @@ -458,26 +475,155 @@ bool LoopIdiomRecognize::runOnLoopBlock( return MadeChange; } -/// processLoopStore - See if this store can be promoted to a memset. -bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - assert(SI->isSimple() && "Expected only non-volatile stores."); +/// processLoopStores - See if this store(s) can be promoted to a memset. +bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl &SL, + const SCEV *BECount, + bool ForMemset) { + // Try to find consecutive stores that can be transformed into memsets. + SetVector Heads, Tails; + SmallDenseMap ConsecutiveChain; - Value *StoredVal = SI->getValueOperand(); - Value *StorePtr = SI->getPointerOperand(); + // Do a quadratic search on all of the given stores and find + // all of the pairs of stores that follow each other. + SmallVector IndexQueue; + for (unsigned i = 0, e = SL.size(); i < e; ++i) { + assert(SL[i]->isSimple() && "Expected only non-volatile stores."); - // Check to see if the stride matches the size of the store. If so, then we - // know that every byte is touched in the loop. - const SCEVAddRecExpr *StoreEv = cast(SE->getSCEV(StorePtr)); - unsigned Stride = getStoreStride(StoreEv); - unsigned StoreSize = getStoreSizeInBytes(SI, DL); - if (StoreSize != Stride && StoreSize != -Stride) - return false; + Value *FirstStoredVal = SL[i]->getValueOperand(); + Value *FirstStorePtr = SL[i]->getPointerOperand(); + const SCEVAddRecExpr *FirstStoreEv = + cast(SE->getSCEV(FirstStorePtr)); + unsigned FirstStride = getStoreStride(FirstStoreEv); + unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL); - bool NegStride = StoreSize == -Stride; + // See if we can optimize just this store in isolation. + if (FirstStride == FirstStoreSize || FirstStride == -FirstStoreSize) { + Heads.insert(SL[i]); + continue; + } - // See if we can optimize just this store in isolation. - return processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), - StoredVal, SI, StoreEv, BECount, NegStride); + Value *FirstSplatValue = nullptr; + Constant *FirstPatternValue = nullptr; + + if (ForMemset) + FirstSplatValue = isBytewiseValue(FirstStoredVal); + else + FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL); + + assert((FirstSplatValue || FirstPatternValue) && + "Expected either splat value or pattern value."); + + IndexQueue.clear(); + // If a store has multiple consecutive store candidates, search Stores + // array according to the sequence: from i+1 to e, then from i-1 to 0. 
+ // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find memset opportunity. + unsigned j = 0; + for (j = i + 1; j < e; ++j) + IndexQueue.push_back(j); + for (j = i; j > 0; --j) + IndexQueue.push_back(j - 1); + + for (auto &k : IndexQueue) { + assert(SL[k]->isSimple() && "Expected only non-volatile stores."); + Value *SecondStorePtr = SL[k]->getPointerOperand(); + const SCEVAddRecExpr *SecondStoreEv = + cast(SE->getSCEV(SecondStorePtr)); + unsigned SecondStride = getStoreStride(SecondStoreEv); + + if (FirstStride != SecondStride) + continue; + + Value *SecondStoredVal = SL[k]->getValueOperand(); + Value *SecondSplatValue = nullptr; + Constant *SecondPatternValue = nullptr; + + if (ForMemset) + SecondSplatValue = isBytewiseValue(SecondStoredVal); + else + SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL); + + assert((SecondSplatValue || SecondPatternValue) && + "Expected either splat value or pattern value."); + + if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) { + if (ForMemset) { + ConstantInt *C1 = dyn_cast(FirstSplatValue); + ConstantInt *C2 = dyn_cast(SecondSplatValue); + if (!C1 || !C2 || C1 != C2) + continue; + } else { + Constant *C1 = FirstPatternValue; + Constant *C2 = SecondPatternValue; + + if (ConstantArray *CA1 = dyn_cast(C1)) + C1 = CA1->getSplatValue(); + + if (ConstantArray *CA2 = dyn_cast(C2)) + C2 = CA2->getSplatValue(); + + if (C1 != C2) + continue; + } + Tails.insert(SL[k]); + Heads.insert(SL[i]); + ConsecutiveChain[SL[i]] = SL[k]; + break; + } + } + } + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we transformed so that we don't visit the same store twice. + SmallPtrSet TransformedStores; + bool Changed = false; + + // For stores that start but don't end a link in the chain: + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to transform it. + SmallPtrSet AdjacentStores; + StoreInst *I = *it; + + StoreInst *HeadStore = I; + unsigned StoreSize = 0; + + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (TransformedStores.count(I)) + break; + AdjacentStores.insert(I); + + StoreSize += getStoreSizeInBytes(I, DL); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + Value *StoredVal = HeadStore->getValueOperand(); + Value *StorePtr = HeadStore->getPointerOperand(); + const SCEVAddRecExpr *StoreEv = cast(SE->getSCEV(StorePtr)); + unsigned Stride = getStoreStride(StoreEv); + + // Check to see if the stride matches the size of the stores. If so, then + // we know that every byte is touched in the loop. + if (StoreSize != Stride && StoreSize != -Stride) + continue; + + bool NegStride = StoreSize == -Stride; + + if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), + StoredVal, HeadStore, AdjacentStores, StoreEv, + BECount, NegStride)) { + TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); + Changed = true; + } + } + + return Changed; } /// processLoopMemSet - See if this memset can be promoted to a large memset. 
@@ -520,18 +666,21 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue)) return false; + SmallPtrSet MSIs; + MSIs.insert(MSI); return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), SplatValue, MSI, Ev, + MSI->getAlignment(), SplatValue, MSI, MSIs, Ev, BECount, /*NegStride=*/false); } /// mayLoopAccessLocation - Return true if the specified loop might access the /// specified pointer location, which is a loop-strided access. The 'Access' /// argument specifies what the verboten forms of access are (read or write). -static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, - const SCEV *BECount, unsigned StoreSize, - AliasAnalysis &AA, - Instruction *IgnoredStore) { +static bool +mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, + const SCEV *BECount, unsigned StoreSize, + AliasAnalysis &AA, + SmallPtrSetImpl &IgnoredStores) { // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts // at the pointer and has infinite size. @@ -551,7 +700,8 @@ static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; ++BI) for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) - if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access)) + if (IgnoredStores.count(&*I) == 0 && + (AA.getModRefInfo(&*I, StoreLoc) & Access)) return true; return false; @@ -574,7 +724,8 @@ static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, /// transform this into a memset or memset_pattern in the loop preheader, do so. bool LoopIdiomRecognize::processLoopStridedStore( Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, - Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, + Value *StoredVal, Instruction *TheStore, + SmallPtrSetImpl &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool NegStride) { Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; @@ -609,7 +760,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *BasePtr = Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, - *AA, TheStore)) { + *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); @@ -662,7 +813,8 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(TheStore, TLI); + for (auto *I : Stores) + deleteDeadInstruction(I, TLI); ++NumMemSet; return true; } @@ -714,8 +866,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *StoreBasePtr = Expander.expandCodeFor( StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + SmallPtrSet Stores; + Stores.insert(SI); if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, - StoreSize, *AA, SI)) { + StoreSize, *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. 
RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); @@ -735,7 +889,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, - *AA, SI)) { + *AA, Stores)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2520c78b538..8989b13cccc 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -401,9 +402,6 @@ public: } } - /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL); - /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -438,14 +436,6 @@ private: /// vectorized, or NULL. They may happen in cycles. Value *alreadyVectorized(ArrayRef VL) const; - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); - - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); - /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(Type *Ty); @@ -1191,8 +1181,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { return; } - if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { - if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, *SE)) { + if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL, *SE)) { ++NumLoadsWantToChangeOrder; } BS.cancelScheduling(VL); @@ -1364,7 +1354,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { const DataLayout &DL = F->getParent()->getDataLayout(); // Check if the stores are consecutive or of we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL, *SE)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); @@ -1837,63 +1827,6 @@ int BoUpSLP::getGatherCost(ArrayRef VL) { return getGatherCost(VecTy); } -Value *BoUpSLP::getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast(I)) - return SI->getPointerOperand(); - return nullptr; -} - -unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { - if (LoadInst *L = dyn_cast(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast(I)) - return S->getPointerAddressSpace(); - return -1; -} - -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); - unsigned ASA = getAddressSpaceOperand(A); - unsigned ASB = getAddressSpaceOperand(B); - - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; - - // Make sure that A and B are different pointers of the same type. - if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) - return false; - - unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); - Type *Ty = cast(PtrA->getType())->getElementType(); - APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); - - APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); - - APInt OffsetDelta = OffsetB - OffsetA; - - // Check if they are based on the same pointer. That makes the offsets - // sufficient. - if (PtrA == PtrB) - return OffsetDelta == Size; - - // Compute the necessary base pointer delta to have the necessary final delta - // equal to the size. - APInt BaseDelta = Size - OffsetDelta; - - // Otherwise compute the distance with SCEV between the base pointers. - const SCEV *PtrSCEVA = SE->getSCEV(PtrA); - const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *C = SE->getConstant(BaseDelta); - const SCEV *X = SE->getAddExpr(PtrSCEVA, C); - return X == PtrSCEVB; -} - // Reorder commutative operations in alternate shuffle if the resulting vectors // are consecutive loads. This would allow us to vectorize the tree. 
// If we have something like- @@ -1921,10 +1854,10 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef VL, if (LoadInst *L1 = dyn_cast(Right[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); - if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + if (isConsecutiveAccess(L, L1, DL, *SE) && VL1->isCommutative()) { std::swap(Left[j], Right[j]); continue; - } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + } else if (isConsecutiveAccess(L, L1, DL, *SE) && VL2->isCommutative()) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -1935,10 +1868,10 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef VL, if (LoadInst *L1 = dyn_cast(Left[j + 1])) { Instruction *VL1 = cast(VL[j]); Instruction *VL2 = cast(VL[j + 1]); - if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + if (isConsecutiveAccess(L, L1, DL, *SE) && VL1->isCommutative()) { std::swap(Left[j], Right[j]); continue; - } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + } else if (isConsecutiveAccess(L, L1, DL, *SE) && VL2->isCommutative()) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -2088,7 +2021,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, for (unsigned j = 0; j < VL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - if (isConsecutiveAccess(L, L1, DL)) { + if (isConsecutiveAccess(L, L1, DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -2096,7 +2029,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - if (isConsecutiveAccess(L, L1, DL)) { + if (isConsecutiveAccess(L, L1, DL, *SE)) { std::swap(Left[j + 1], Right[j + 1]); continue; } @@ -3461,7 +3394,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef Stores, IndexQueue.push_back(j - 1); for (auto &k : IndexQueue) { - if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) { + if (isConsecutiveAccess(Stores[i], Stores[k], DL, *SE)) { Tails.insert(Stores[k]); Heads.insert(Stores[i]); ConsecutiveChain[Stores[i]] = Stores[k]; diff --git a/test/Transforms/LoopIdiom/struct.ll b/test/Transforms/LoopIdiom/struct.ll new file mode 100644 index 00000000000..2828024952e --- /dev/null +++ b/test/Transforms/LoopIdiom/struct.ll @@ -0,0 +1,221 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +target triple = "x86_64-apple-darwin10.0.0" + +%struct.foo = type { i32, i32 } +%struct.foo1 = type { i32, i32, i32 } +%struct.foo2 = type { i32, i16, i16 } + +;void bar1(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 0; +; } +;} +define void @bar1(%struct.foo* %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 0, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 
%lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar1( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} + +;void bar2(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].b = 0; +; f[i].a = 0; +; } +;} +define void @bar2(%struct.foo* %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 0, i32* %b, align 4 + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar2( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} + +;void bar3(foo_t *f, unsigned n) { +; for (unsigned i = n; i > 0; --i) { +; f[i].a = 0; +; f[i].b = 0; +; } +;} +define void @bar3(%struct.foo* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 0, i32* %b, align 4 + %1 = trunc i64 %indvars.iv to i32 + %dec = add i32 %1, -1 + %cmp = icmp eq i32 %dec, 0 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar3( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} + +;void bar4(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 1; +; } +;} +define void @bar4(%struct.foo* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 1, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: 
@bar4( +; CHECK-NOT: call void @llvm.memset +} + +;void bar5(foo1_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 0; +; } +;} +define void @bar5(%struct.foo1* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 1 + store i32 0, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar5( +; CHECK-NOT: call void @llvm.memset +} + +;void bar6(foo2_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 0; +; f[i].c = 0; +; } +;} +define void @bar6(%struct.foo2* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i64 %indvars.iv, i32 1 + store i16 0, i16* %b, align 4 + %c = getelementptr inbounds %struct.foo2, %struct.foo2* %f, i64 %indvars.iv, i32 2 + store i16 0, i16* %c, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar6( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} diff --git a/test/Transforms/LoopIdiom/struct_pattern.ll b/test/Transforms/LoopIdiom/struct_pattern.ll new file mode 100644 index 00000000000..d7809b746b1 --- /dev/null +++ b/test/Transforms/LoopIdiom/struct_pattern.ll @@ -0,0 +1,186 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 +; CHECK: @.memset_pattern.1 = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 +; CHECK: @.memset_pattern.2 = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 + +target triple = "x86_64-apple-darwin10.0.0" + +%struct.foo = type { i32, i32 } +%struct.foo1 = type { i32, i32, i32 } + +;void bar1(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 2; +; f[i].b = 2; +; } +;} +define void @bar1(%struct.foo* %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label 
%for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 2, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 2, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar1( +; CHECK: call void @memset_pattern16 +; CHECK-NOT: store +} + +;void bar2(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].b = 2; +; f[i].a = 2; +; } +;} +define void @bar2(%struct.foo* %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 2, i32* %b, align 4 + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 2, i32* %a, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar2( +; CHECK: call void @memset_pattern16 +; CHECK-NOT: store +} + +;void bar3(foo_t *f, unsigned n) { +; for (unsigned i = n; i > 0; --i) { +; f[i].a = 2; +; f[i].b = 2; +; } +;} +define void @bar3(%struct.foo* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 2, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 2, i32* %b, align 4 + %1 = trunc i64 %indvars.iv to i32 + %dec = add i32 %1, -1 + %cmp = icmp eq i32 %dec, 0 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar3( +; CHECK: call void @memset_pattern16 +; CHECK-NOT: store +} + +;void bar4(foo_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 0; +; f[i].b = 1; +; } +;} +define void @bar4(%struct.foo* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ 
%indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 0 + store i32 0, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo, %struct.foo* %f, i64 %indvars.iv, i32 1 + store i32 1, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar4( +; CHECK-NOT: call void @memset_pattern16 +} + +;void bar5(foo1_t *f, unsigned n) { +; for (unsigned i = 0; i < n; ++i) { +; f[i].a = 1; +; f[i].b = 1; +; } +;} +define void @bar5(%struct.foo1* nocapture %f, i32 %n) nounwind ssp { +entry: + %cmp1 = icmp eq i32 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %a = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 0 + store i32 1, i32* %a, align 4 + %b = getelementptr inbounds %struct.foo1, %struct.foo1* %f, i64 %indvars.iv, i32 1 + store i32 1, i32* %b, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @bar5( +; CHECK-NOT: call void @memset_pattern16 +} diff --git a/test/Transforms/LoopIdiom/unroll.ll b/test/Transforms/LoopIdiom/unroll.ll new file mode 100644 index 00000000000..0cdfda254d7 --- /dev/null +++ b/test/Transforms/LoopIdiom/unroll.ll @@ -0,0 +1,80 @@ +; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +; CHECK @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16 + +target triple = "x86_64-apple-darwin10.0.0" + +;void test(int *f, unsigned n) { +; for (unsigned i = 0; i < 2 * n; i += 2) { +; f[i] = 0; +; f[i+1] = 0; +; } +;} +define void @test(i32* %f, i32 %n) nounwind ssp { +entry: + %mul = shl i32 %n, 1 + %cmp1 = icmp eq i32 %mul, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = zext i32 %mul to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv + store i32 0, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %f, i64 %1 + store i32 0, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, %0 + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @test( +; CHECK: call void @llvm.memset +; CHECK-NOT: store +} + +;void test_pattern(int *f, unsigned n) { +; for (unsigned i = 0; i < 2 * n; i += 2) { +; f[i] = 2; +; 
f[i+1] = 2; +; } +;} +define void @test_pattern(i32* %f, i32 %n) nounwind ssp { +entry: + %mul = shl i32 %n, 1 + %cmp1 = icmp eq i32 %mul, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = zext i32 %mul to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv + store i32 2, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %f, i64 %1 + store i32 2, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, %0 + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +; CHECK-LABEL: @test_pattern( +; CHECK: call void @memset_pattern16 +; CHECK-NOT: store +}