
[SLP] Vectorize loads of consecutive memory locations that are accessed in a non-consecutive (jumbled) order.

The jumbled scalar loads are sorted while building the tree, and these accesses are marked so that the vectorized load is followed by a shufflevector with the proper mask.
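For intuition, here is a minimal C++ sketch (hypothetical, loosely modeled on the jumbled-load test updated below) of the kind of source pattern this enables: the scalar loads cover a consecutive block of memory but read it out of order, so the SLP vectorizer can now emit one vector load followed by a shufflevector instead of gathering the scalars.

void mul_jumbled(const int *in, const int *inn, int *out) {
  // The loads cover in[0..3] and inn[0..3], but in a jumbled order;
  // the stores to out[0..3] are consecutive.
  out[0] = in[1] * inn[0];
  out[1] = in[3] * inn[1];
  out[2] = in[2] * inn[3];
  out[3] = in[0] * inn[2];
}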

Reviewers: hfinkel, mssimpso, mkuper

Differential Revision: https://reviews.llvm.org/D26905

Change-Id: I9c0c8e6f91a00076a7ee1465440a3f6ae092f7ad
llvm-svn: 293386
Mohammad Shahid, 2017-01-28 17:59:44 +00:00
commit a2646e67e7 (parent 82b0fc2a51)
6 changed files with 178 additions and 93 deletions


@ -690,6 +690,11 @@ int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
const ValueToValueMap &StridesMap = ValueToValueMap(),
bool Assume = false, bool ShouldCheckWrap = true);
/// \brief Sorts the jumbled memory accesses and saves the sorted accesses in
/// the vector argument 'Sorted'.
void sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted);
/// \brief Returns true if the memory operations \p A and \p B are consecutive.
/// This is a simple API that does not depend on the analysis pass.
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,


@ -1058,6 +1058,37 @@ static unsigned getAddressSpaceOperand(Value *I) {
return -1;
}
/// Sorts the memory accesses and saves the sorted accesses in the vector argument 'Sorted'.
void llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
ScalarEvolution &SE,
SmallVectorImpl<Value *> &Sorted) {
SmallVector<std::pair<int, Value *>, 4> OffValPairs;
for (auto *Val : VL) {
// Compute the constant offset of each memory access from its base pointer and
// insert it into the vector of (offset, value) pairs, which is sorted below.
Value *Ptr = getPointerOperand(Val);
unsigned AS = getAddressSpaceOperand(Val);
unsigned PtrBitWidth = DL.getPointerSizeInBits(AS);
Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
// FIXME: Currently the offsets are assumed to be constant. However, this is
// not always true, as offsets can also be variables, in which case we would
// need to consider the difference of the variable offsets.
APInt Offset(PtrBitWidth, 0);
Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
OffValPairs.push_back(std::make_pair(Offset.getSExtValue(), Val));
}
std::sort(OffValPairs.begin(), OffValPairs.end(),
[](const std::pair<int, Value *> &Left,
const std::pair<int, Value *> &Right) {
return Left.first < Right.first;
});
for (auto& it : OffValPairs)
Sorted.push_back(it.second);
}
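As a worked illustration of this sorting step (a standalone sketch using plain standard-library types and made-up offsets, not part of this patch): accesses at byte offsets 4, 12, 0 and 8 from a common base come out in the order 0, 4, 8, 12, which is exactly the consecutive order the vectorizer can handle.

#include <algorithm>
#include <cstdint>
#include <vector>

// Toy stand-in for a scalar memory access: the constant byte offset from the
// common base pointer plus an opaque id standing in for the instruction.
struct ToyAccess { int64_t Offset; int Id; };

// Mirrors the idea of sortMemAccesses: order the accesses by offset and
// return the ids in that order.
std::vector<int> sortByOffset(std::vector<ToyAccess> Accesses) {
  std::sort(Accesses.begin(), Accesses.end(),
            [](const ToyAccess &L, const ToyAccess &R) {
              return L.Offset < R.Offset;
            });
  std::vector<int> Order;
  for (const ToyAccess &A : Accesses)
    Order.push_back(A.Id);
  return Order;
}
// For example, {{4,0}, {12,1}, {0,2}, {8,3}} sorts to ids {2, 0, 3, 1}.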
/// Returns true if the memory operations \p A and \p B are consecutive.
bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
ScalarEvolution &SE, bool CheckType) {


@ -410,8 +410,10 @@ private:
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
/// Vectorize a single entry in the tree.
Value *vectorizeTree(TreeEntry *E);
/// Vectorize a single entry in the tree. VL contains all the isomorphic
/// scalars in the order of their use in the user program, for example ADD1,
/// ADD2, ..., or LOAD1, LOAD2, etc.
Value *vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E);
/// Vectorize a single entry in the tree, starting in \p VL.
Value *vectorizeTree(ArrayRef<Value *> VL);
@ -452,7 +454,7 @@ private:
SmallVectorImpl<Value *> &Right);
struct TreeEntry {
TreeEntry() : Scalars(), VectorizedValue(nullptr),
NeedToGather(0) {}
NeedToGather(0), NeedToShuffle(0) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
@ -460,6 +462,15 @@ private:
return std::equal(VL.begin(), VL.end(), Scalars.begin());
}
/// \returns true if the scalars in VL, once sorted, match this tree entry
/// (i.e. VL is a jumbled permutation of the entry's scalars).
bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
ScalarEvolution &SE) const {
assert(VL.size() == Scalars.size() && "Invalid size");
SmallVector<Value *, 8> List;
sortMemAccesses(VL, DL, SE, List);
return std::equal(List.begin(), List.end(), Scalars.begin());
}
/// A vector of scalars.
ValueList Scalars;
@ -468,15 +479,20 @@ private:
/// Do we need to gather this sequence ?
bool NeedToGather;
/// Do we need to shuffle the load ?
bool NeedToShuffle;
};
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
bool NeedToShuffle) {
VectorizableTree.emplace_back();
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
Last->NeedToShuffle = NeedToShuffle;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
@ -983,21 +999,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
// Don't handle vectors.
if (VL[0]->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
unsigned Opcode = getSameOpcode(VL);
@ -1014,7 +1030,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
@ -1026,7 +1042,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
}
@ -1039,7 +1055,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
}
@ -1052,7 +1068,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (ScalarToTreeEntry.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
}
@ -1062,7 +1078,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
}
@ -1076,7 +1092,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
@ -1085,7 +1101,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned j = i+1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
@ -1100,7 +1116,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
assert((!BS.getScheduleData(VL[0]) ||
!BS.getScheduleData(VL[0])->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@ -1117,12 +1133,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
}
newTreeEntry(VL, true);
newTreeEntry(VL, true, false);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@ -1144,7 +1160,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
} else {
BS.cancelScheduling(VL);
}
newTreeEntry(VL, Reuse);
newTreeEntry(VL, Reuse, false);
return;
}
case Instruction::Load: {
@ -1160,7 +1176,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
@ -1171,15 +1187,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
}
// Check if the loads are consecutive, reversed, or neither.
// TODO: What we really want is to sort the loads, but for now, check
// the two likely directions.
bool Consecutive = true;
bool ReverseConsecutive = true;
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
@ -1193,7 +1207,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Consecutive) {
++NumLoadsWantToKeepOrder;
newTreeEntry(VL, true);
newTreeEntry(VL, true, false);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
@ -1207,8 +1221,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
break;
}
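// The loads are not consecutive in program order. If they are not
// reverse-consecutive either, try sorting them by their constant pointer
// offsets; if the sorted order is consecutive, vectorize in that order and
// remember that a shuffle is needed to restore the original (jumbled) order.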
if (VL.size() > 2 && !ReverseConsecutive) {
bool ShuffledLoads = true;
SmallVector<Value *, 8> List;
sortMemAccesses(VL, *DL, *SE, List);
auto NewVL = makeArrayRef(List.begin(), List.end());
for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
ShuffledLoads = false;
break;
}
}
if (ShuffledLoads) {
newTreeEntry(NewVL, true, true);
return;
}
}
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
if (ReverseConsecutive) {
++NumLoadsWantToChangeOrder;
@ -1231,16 +1262,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
for (Value *Val : VL) {
Type *Ty = cast<Instruction>(Val)->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
newTreeEntry(VL, true);
newTreeEntry(VL, true, false);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@ -1263,13 +1294,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
newTreeEntry(VL, true);
newTreeEntry(VL, true, false);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@ -1301,7 +1332,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
newTreeEntry(VL, true);
newTreeEntry(VL, true, false);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
@ -1326,11 +1357,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
for (Value *Val : VL) {
if (cast<Instruction>(Val)->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
}
@ -1338,29 +1369,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// We can't combine several GEPs into one vector if they operate on
// different types.
Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
for (unsigned j = 0; j < VL.size(); ++j) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
for (Value *Val : VL) {
Type *CurTy = cast<Instruction>(Val)->getOperand(0)->getType();
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
}
// We don't combine GEPs with non-constant indexes.
for (unsigned j = 0; j < VL.size(); ++j) {
auto Op = cast<Instruction>(VL[j])->getOperand(1);
for (Value *Val : VL) {
auto Op = cast<Instruction>(Val)->getOperand(1);
if (!isa<ConstantInt>(Op)) {
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
return;
}
}
newTreeEntry(VL, true);
newTreeEntry(VL, true, false);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
@ -1377,12 +1408,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
newTreeEntry(VL, true);
newTreeEntry(VL, true, false);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
@ -1400,7 +1431,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
@ -1414,7 +1445,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
@ -1425,7 +1456,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
@ -1438,14 +1469,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
newTreeEntry(VL, true);
newTreeEntry(VL, true, false);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@ -1462,11 +1493,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// then do not vectorize this instruction.
if (!isAltShuffle) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
newTreeEntry(VL, true);
newTreeEntry(VL, true, false);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
@ -1490,7 +1521,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
default:
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
newTreeEntry(VL, false, false);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
@ -1723,6 +1754,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
VecTy, alignment, 0);
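// A jumbled load additionally needs a single-source permute to restore the
// original program order, so include that shuffle in the vector cost.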
if (E->NeedToShuffle) {
VecLdCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
}
return VecLdCost - ScalarLdCost;
}
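As a rough worked example with made-up unit costs (illustrative only, not real TargetTransformInfo numbers): for a width-4 jumbled load, four scalar loads cost 4 while a vector load plus one single-source permute costs 2, so the returned delta is negative and vectorization looks profitable.

// Illustrative only: assumed unit costs, not actual TargetTransformInfo values.
constexpr int ScalarLoadCost = 1, VectorLoadCost = 1, PermuteCost = 1, VF = 4;
constexpr int IllustrativeScalarCost = VF * ScalarLoadCost;          // 4
constexpr int IllustrativeVectorCost = VectorLoadCost + PermuteCost; // 2
static_assert(IllustrativeVectorCost - IllustrativeScalarCost == -2,
              "negative delta: the vectorized form looks cheaper");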
case Instruction::Store: {
@ -2285,8 +2320,8 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
if (ScalarToTreeEntry.count(VL[0])) {
int Idx = ScalarToTreeEntry[VL[0]];
TreeEntry *E = &VectorizableTree[Idx];
if (E->isSame(VL))
return vectorizeTree(E);
if (E->isSame(VL) || (E->NeedToShuffle && E->isFoundJumbled(VL, *DL, *SE)))
return vectorizeTree(VL, E);
}
Type *ScalarTy = VL[0]->getType();
@ -2297,10 +2332,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
return Gather(VL, VecTy);
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E) {
IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
if (E->VectorizedValue && !E->NeedToShuffle) {
DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
@ -2534,7 +2569,35 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
return propagateMetadata(LI, E->Scalars);
propagateMetadata(LI, E->Scalars);
// As the program order of the scalar loads is jumbled, the vectorized 'load'
// must be followed by a 'shuffle' with the required jumbled mask.
if (!VL.empty() && (E->NeedToShuffle)) {
assert(VL.size() == E->Scalars.size() &&
"Equal number of scalars expected");
SmallVector<Constant *, 8> Mask;
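// For each scalar in its original program order (VL), find the lane it
// occupies in the sorted tree entry; that lane index becomes the
// corresponding shuffle mask element.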
for (Value *Val : VL) {
if (ScalarToTreeEntry.count(Val)) {
int Idx = ScalarToTreeEntry[Val];
TreeEntry *E = &VectorizableTree[Idx];
for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
if (E->Scalars[Lane] == Val) {
Mask.push_back(Builder.getInt32(Lane));
break;
}
}
}
}
// Generate the shuffle for the jumbled memory accesses.
Value *Undef = UndefValue::get(VecTy);
Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
ConstantVector::get(Mask));
return Shuf;
}
return LI;
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
@ -2709,7 +2772,7 @@ Value *BoUpSLP::vectorizeTree() {
}
Builder.SetInsertPoint(&F->getEntryBlock().front());
auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
auto *VectorRoot = vectorizeTree(ArrayRef<Value *>(), &VectorizableTree[0]);
// If the vectorized tree can be rewritten in a smaller type, we truncate the
// vectorized root. InstCombine will then rewrite the entire expression. We


@ -1,38 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s
define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load(
; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0
; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0
; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]]
; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]]
; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]]
; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0
; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_7]], align 4
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1
; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_8]], align 4
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2
; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_9]], align 4
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3
; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_10]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0


@ -31,8 +31,8 @@ define i32 @test(i32* nocapture readonly %p) {
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[TMP3]], [[SUM]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[TMP4]], [[SUM]]
; CHECK-NEXT: br i1 true, label %for.end, label %for.body
; CHECK: for.end:
; CHECK-NEXT: ret i32 [[ADD_7]]


@ -1,38 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s
define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
; CHECK-LABEL: @jumbled-load(
; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_9]], align 4
; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_7]], align 4
; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_10]], align 4
; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_8]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0