mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
[Loop Vectorizer] Support predication of div/rem
div/rem instructions in basic blocks that require predication currently prevent vectorization. This patch extends the existing mechanism for predicating stores to handle other instructions and leverages it to predicate divs and rems. Differential Revision: https://reviews.llvm.org/D22918 llvm-svn: 279620
This commit is contained in:
parent
f8172f3de2
commit
06e1775b78
@ -386,8 +386,9 @@ protected:
|
||||
/// See PR14725.
|
||||
void fixLCSSAPHIs();
|
||||
|
||||
/// Predicate conditional stores on their respective conditions.
|
||||
void predicateStores();
|
||||
/// Predicate conditional instructions that require predication on their
|
||||
/// respective conditions.
|
||||
void predicateInstructions();
|
||||
|
||||
/// Shrinks vector element sizes based on information in "MinBWs".
|
||||
void truncateToMinimalBitwidths();
|
||||
@ -414,11 +415,11 @@ protected:
|
||||
void updateAnalysis();
|
||||
|
||||
/// This instruction is un-vectorizable. Implement it as a sequence
|
||||
/// of scalars. If \p IfPredicateStore is true we need to 'hide' each
|
||||
/// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
|
||||
/// scalarized instruction behind an if block predicated on the control
|
||||
/// dependence of the instruction.
|
||||
virtual void scalarizeInstruction(Instruction *Instr,
|
||||
bool IfPredicateStore = false);
|
||||
bool IfPredicateInstr = false);
|
||||
|
||||
/// Vectorize Load and Store instructions,
|
||||
virtual void vectorizeMemoryInstruction(Instruction *Instr);
|
||||
@ -624,7 +625,7 @@ protected:
|
||||
|
||||
/// Store instructions that should be predicated, as a pair
|
||||
/// <StoreInst, Predicate>
|
||||
SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores;
|
||||
SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
|
||||
EdgeMaskCache MaskCache;
|
||||
/// Trip count of the original loop.
|
||||
Value *TripCount;
|
||||
@ -654,7 +655,7 @@ public:
|
||||
|
||||
private:
|
||||
void scalarizeInstruction(Instruction *Instr,
|
||||
bool IfPredicateStore = false) override;
|
||||
bool IfPredicateInstr = false) override;
|
||||
void vectorizeMemoryInstruction(Instruction *Instr) override;
|
||||
Value *getBroadcastInstrs(Value *V) override;
|
||||
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
|
||||
@ -2767,8 +2768,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
|
||||
}
|
||||
|
||||
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
|
||||
bool IfPredicateStore) {
|
||||
bool IfPredicateInstr) {
|
||||
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
|
||||
DEBUG(dbgs() << "LV: Scalarizing"
|
||||
<< (IfPredicateInstr ? " and predicating:" : ":") << *Instr
|
||||
<< '\n');
|
||||
// Holds vector parameters or scalars, in case of uniform vals.
|
||||
SmallVector<VectorParts, 4> Params;
|
||||
|
||||
@ -2812,7 +2816,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
|
||||
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
|
||||
|
||||
VectorParts Cond;
|
||||
if (IfPredicateStore) {
|
||||
if (IfPredicateInstr) {
|
||||
assert(Instr->getParent()->getSinglePredecessor() &&
|
||||
"Only support single predecessor blocks");
|
||||
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
|
||||
@ -2826,7 +2830,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
|
||||
|
||||
// Start if-block.
|
||||
Value *Cmp = nullptr;
|
||||
if (IfPredicateStore) {
|
||||
if (IfPredicateInstr) {
|
||||
Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
|
||||
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
|
||||
ConstantInt::get(Cmp->getType(), 1));
|
||||
@ -2865,9 +2869,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
|
||||
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
|
||||
Builder.getInt32(Width));
|
||||
// End if-block.
|
||||
if (IfPredicateStore)
|
||||
PredicatedStores.push_back(
|
||||
std::make_pair(cast<StoreInst>(Cloned), Cmp));
|
||||
if (IfPredicateInstr)
|
||||
PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -3398,9 +3401,13 @@ static Value *addFastMathFlag(Value *V) {
|
||||
return V;
|
||||
}
|
||||
|
||||
/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
|
||||
/// the result needs to be inserted and/or extracted from vectors.
|
||||
/// \brief Estimate the overhead of scalarizing a value based on its type.
|
||||
/// Insert and Extract are set if the result needs to be inserted and/or
|
||||
/// extracted from vectors.
|
||||
/// If the instruction is also to be predicated, add the cost of a PHI
|
||||
/// node to the insertion cost.
|
||||
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
|
||||
bool Predicated,
|
||||
const TargetTransformInfo &TTI) {
|
||||
if (Ty->isVoidTy())
|
||||
return 0;
|
||||
@ -3409,15 +3416,58 @@ static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
|
||||
unsigned Cost = 0;
|
||||
|
||||
for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
|
||||
if (Insert)
|
||||
Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
|
||||
if (Extract)
|
||||
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
|
||||
if (Insert) {
|
||||
Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
|
||||
if (Predicated)
|
||||
Cost += TTI.getCFInstrCost(Instruction::PHI);
|
||||
}
|
||||
}
|
||||
|
||||
// We assume that if-converted blocks have a 50% chance of being executed.
|
||||
// Predicated scalarized instructions are avoided due to the CF that bypasses
|
||||
// turned off lanes. The extracts and inserts will be sinked/hoisted to the
|
||||
// predicated basic-block and are subjected to the same assumption.
|
||||
if (Predicated)
|
||||
Cost /= 2;
|
||||
|
||||
return Cost;
|
||||
}
|
||||
|
||||
/// \brief Estimate the overhead of scalarizing an Instruction based on the
|
||||
/// types of its operands and return value.
|
||||
static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
|
||||
Type *RetTy, bool Predicated,
|
||||
const TargetTransformInfo &TTI) {
|
||||
unsigned ScalarizationCost =
|
||||
getScalarizationOverhead(RetTy, true, false, Predicated, TTI);
|
||||
|
||||
for (Type *Ty : OpTys)
|
||||
ScalarizationCost +=
|
||||
getScalarizationOverhead(Ty, false, true, Predicated, TTI);
|
||||
|
||||
return ScalarizationCost;
|
||||
}
|
||||
|
||||
/// \brief Estimate the overhead of scalarizing an instruction. This is a
|
||||
/// convenience wrapper for the type-based getScalarizationOverhead API.
|
||||
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
|
||||
bool Predicated,
|
||||
const TargetTransformInfo &TTI) {
|
||||
if (VF == 1)
|
||||
return 0;
|
||||
|
||||
Type *RetTy = ToVectorTy(I->getType(), VF);
|
||||
|
||||
SmallVector<Type *, 4> OpTys;
|
||||
unsigned OperandsNum = I->getNumOperands();
|
||||
for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
|
||||
OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
|
||||
|
||||
return getScalarizationOverhead(OpTys, RetTy, Predicated, TTI);
|
||||
}
|
||||
|
||||
// Estimate cost of a call instruction CI if it were vectorized with factor VF.
|
||||
// Return the cost of the instruction, including scalarization overhead if it's
|
||||
// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
|
||||
@ -3448,10 +3498,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
|
||||
|
||||
// Compute costs of unpacking argument values for the scalar calls and
|
||||
// packing the return values to a vector.
|
||||
unsigned ScalarizationCost =
|
||||
getScalarizationOverhead(RetTy, true, false, TTI);
|
||||
for (Type *Ty : Tys)
|
||||
ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
|
||||
unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, false, TTI);
|
||||
|
||||
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
|
||||
|
||||
@ -3871,7 +3918,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
|
||||
// Make sure DomTree is updated.
|
||||
updateAnalysis();
|
||||
|
||||
predicateStores();
|
||||
predicateInstructions();
|
||||
|
||||
// Remove redundant induction instructions.
|
||||
cse(LoopVectorBody);
|
||||
@ -4038,17 +4085,128 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
|
||||
LoopMiddleBlock);
|
||||
}
|
||||
}
|
||||
|
||||
void InnerLoopVectorizer::predicateStores() {
|
||||
for (auto KV : PredicatedStores) {
|
||||
|
||||
void InnerLoopVectorizer::predicateInstructions() {
|
||||
|
||||
// For each instruction I marked for predication on value C, split I into its
|
||||
// own basic block to form an if-then construct over C.
|
||||
// Since I may be fed by extractelement and/or be feeding an insertelement
|
||||
// generated during scalarization we try to move such instructions into the
|
||||
// predicated basic block as well. For the insertelement this also means that
|
||||
// the PHI will be created for the resulting vector rather than for the
|
||||
// scalar instruction.
|
||||
// So for some predicated instruction, e.g. the conditional sdiv in:
|
||||
//
|
||||
// for.body:
|
||||
// ...
|
||||
// %add = add nsw i32 %mul, %0
|
||||
// %cmp5 = icmp sgt i32 %2, 7
|
||||
// br i1 %cmp5, label %if.then, label %if.end
|
||||
//
|
||||
// if.then:
|
||||
// %div = sdiv i32 %0, %1
|
||||
// br label %if.end
|
||||
//
|
||||
// if.end:
|
||||
// %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
|
||||
//
|
||||
// the sdiv at this point is scalarized and if-converted using a select.
|
||||
// The inactive elements in the vector are not used, but the predicated
|
||||
// instruction is still executed for all vector elements, essentially:
|
||||
//
|
||||
// vector.body:
|
||||
// ...
|
||||
// %17 = add nsw <2 x i32> %16, %wide.load
|
||||
// %29 = extractelement <2 x i32> %wide.load, i32 0
|
||||
// %30 = extractelement <2 x i32> %wide.load51, i32 0
|
||||
// %31 = sdiv i32 %29, %30
|
||||
// %32 = insertelement <2 x i32> undef, i32 %31, i32 0
|
||||
// %35 = extractelement <2 x i32> %wide.load, i32 1
|
||||
// %36 = extractelement <2 x i32> %wide.load51, i32 1
|
||||
// %37 = sdiv i32 %35, %36
|
||||
// %38 = insertelement <2 x i32> %32, i32 %37, i32 1
|
||||
// %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
|
||||
//
|
||||
// Predication will now re-introduce the original control flow to avoid false
|
||||
// side-effects by the sdiv instructions on the inactive elements, yielding
|
||||
// (after cleanup):
|
||||
//
|
||||
// vector.body:
|
||||
// ...
|
||||
// %5 = add nsw <2 x i32> %4, %wide.load
|
||||
// %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
|
||||
// %9 = extractelement <2 x i1> %8, i32 0
|
||||
// br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
|
||||
//
|
||||
// pred.sdiv.if:
|
||||
// %10 = extractelement <2 x i32> %wide.load, i32 0
|
||||
// %11 = extractelement <2 x i32> %wide.load51, i32 0
|
||||
// %12 = sdiv i32 %10, %11
|
||||
// %13 = insertelement <2 x i32> undef, i32 %12, i32 0
|
||||
// br label %pred.sdiv.continue
|
||||
//
|
||||
// pred.sdiv.continue:
|
||||
// %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
|
||||
// %15 = extractelement <2 x i1> %8, i32 1
|
||||
// br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
|
||||
//
|
||||
// pred.sdiv.if54:
|
||||
// %16 = extractelement <2 x i32> %wide.load, i32 1
|
||||
// %17 = extractelement <2 x i32> %wide.load51, i32 1
|
||||
// %18 = sdiv i32 %16, %17
|
||||
// %19 = insertelement <2 x i32> %14, i32 %18, i32 1
|
||||
// br label %pred.sdiv.continue55
|
||||
//
|
||||
// pred.sdiv.continue55:
|
||||
// %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
|
||||
// %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5
|
||||
|
||||
for (auto KV : PredicatedInstructions) {
|
||||
BasicBlock::iterator I(KV.first);
|
||||
auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
|
||||
BasicBlock *Head = I->getParent();
|
||||
auto *BB = SplitBlock(Head, &*std::next(I), DT, LI);
|
||||
auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
|
||||
/*BranchWeights=*/nullptr, DT, LI);
|
||||
I->moveBefore(T);
|
||||
I->getParent()->setName("pred.store.if");
|
||||
BB->setName("pred.store.continue");
|
||||
// Try to move any extractelement we may have created for the predicated
|
||||
// instruction into the Then block.
|
||||
for (Use &Op : I->operands()) {
|
||||
auto *OpInst = dyn_cast<ExtractElementInst>(&*Op);
|
||||
if (OpInst && OpInst->hasOneUse()) // TODO: more accurately - hasOneUser()
|
||||
OpInst->moveBefore(&*I);
|
||||
}
|
||||
|
||||
I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if");
|
||||
BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue");
|
||||
|
||||
// If the instruction is non-void create a Phi node at reconvergence point.
|
||||
if (!I->getType()->isVoidTy()) {
|
||||
Value *IncomingTrue = nullptr;
|
||||
Value *IncomingFalse = nullptr;
|
||||
|
||||
if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
|
||||
// If the predicated instruction is feeding an insert-element, move it
|
||||
// into the Then block; Phi node will be created for the vector.
|
||||
InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
|
||||
IEI->moveBefore(T);
|
||||
IncomingTrue = IEI; // the new vector with the inserted element.
|
||||
IncomingFalse = IEI->getOperand(0); // the unmodified vector
|
||||
} else {
|
||||
// Phi node will be created for the scalar predicated instruction.
|
||||
IncomingTrue = &*I;
|
||||
IncomingFalse = UndefValue::get(I->getType());
|
||||
}
|
||||
|
||||
BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
|
||||
assert(PostDom && "Then block has multiple successors");
|
||||
PHINode *Phi =
|
||||
PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
|
||||
IncomingTrue->replaceAllUsesWith(Phi);
|
||||
Phi->addIncoming(IncomingFalse, Head);
|
||||
Phi->addIncoming(IncomingTrue, I->getParent());
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG(DT->verifyDomTree());
|
||||
}
|
||||
|
||||
@ -4235,6 +4393,24 @@ void InnerLoopVectorizer::widenPHIInstruction(
|
||||
}
|
||||
}
|
||||
|
||||
/// A helper function for checking whether an integer division-related
|
||||
/// instruction may divide by zero (in which case it must be predicated if
|
||||
/// executed conditionally in the scalar code).
|
||||
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
|
||||
/// Non-zero divisors that are non compile-time constants will not be
|
||||
/// converted into multiplication, so we will still end up scalarizing
|
||||
/// the division, but can do so w/o predication.
|
||||
static bool mayDivideByZero(Instruction &I) {
|
||||
assert((I.getOpcode() == Instruction::UDiv ||
|
||||
I.getOpcode() == Instruction::SDiv ||
|
||||
I.getOpcode() == Instruction::URem ||
|
||||
I.getOpcode() == Instruction::SRem) &&
|
||||
"Unexpected instruction");
|
||||
Value *Divisor = I.getOperand(1);
|
||||
auto *CInt = dyn_cast<ConstantInt>(Divisor);
|
||||
return !CInt || CInt->isZero();
|
||||
}
|
||||
|
||||
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
|
||||
// For each instruction in the old loop.
|
||||
for (Instruction &I : *BB) {
|
||||
@ -4251,17 +4427,23 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
|
||||
continue;
|
||||
} // End of PHI.
|
||||
|
||||
case Instruction::UDiv:
|
||||
case Instruction::SDiv:
|
||||
case Instruction::SRem:
|
||||
case Instruction::URem:
|
||||
// Scalarize with predication if this instruction may divide by zero and
|
||||
// block execution is conditional, otherwise fallthrough.
|
||||
if (mayDivideByZero(I) && Legal->blockNeedsPredication(I.getParent())) {
|
||||
scalarizeInstruction(&I, true);
|
||||
continue;
|
||||
}
|
||||
case Instruction::Add:
|
||||
case Instruction::FAdd:
|
||||
case Instruction::Sub:
|
||||
case Instruction::FSub:
|
||||
case Instruction::Mul:
|
||||
case Instruction::FMul:
|
||||
case Instruction::UDiv:
|
||||
case Instruction::SDiv:
|
||||
case Instruction::FDiv:
|
||||
case Instruction::URem:
|
||||
case Instruction::SRem:
|
||||
case Instruction::FRem:
|
||||
case Instruction::Shl:
|
||||
case Instruction::LShr:
|
||||
@ -5155,17 +5337,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
|
||||
}
|
||||
if (I.mayThrow())
|
||||
return false;
|
||||
|
||||
// The instructions below can trap.
|
||||
switch (I.getOpcode()) {
|
||||
default:
|
||||
continue;
|
||||
case Instruction::UDiv:
|
||||
case Instruction::SDiv:
|
||||
case Instruction::URem:
|
||||
case Instruction::SRem:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -6082,17 +6253,24 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
|
||||
// TODO: IF-converted IFs become selects.
|
||||
return 0;
|
||||
}
|
||||
case Instruction::UDiv:
|
||||
case Instruction::SDiv:
|
||||
case Instruction::URem:
|
||||
case Instruction::SRem:
|
||||
// We assume that if-converted blocks have a 50% chance of being executed.
|
||||
// Predicated scalarized instructions are avoided due to the CF that
|
||||
// bypasses turned off lanes. If we are not predicating, fallthrough.
|
||||
if (VF > 1 && mayDivideByZero(*I) &&
|
||||
Legal->blockNeedsPredication(I->getParent()))
|
||||
return VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy) / 2 +
|
||||
getScalarizationOverhead(I, VF, true, TTI);
|
||||
case Instruction::Add:
|
||||
case Instruction::FAdd:
|
||||
case Instruction::Sub:
|
||||
case Instruction::FSub:
|
||||
case Instruction::Mul:
|
||||
case Instruction::FMul:
|
||||
case Instruction::UDiv:
|
||||
case Instruction::SDiv:
|
||||
case Instruction::FDiv:
|
||||
case Instruction::URem:
|
||||
case Instruction::SRem:
|
||||
case Instruction::FRem:
|
||||
case Instruction::Shl:
|
||||
case Instruction::LShr:
|
||||
@ -6328,28 +6506,11 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
|
||||
return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
|
||||
return CallCost;
|
||||
}
|
||||
default: {
|
||||
// We are scalarizing the instruction. Return the cost of the scalar
|
||||
// instruction, plus the cost of insert and extract into vector
|
||||
// elements, times the vector width.
|
||||
unsigned Cost = 0;
|
||||
|
||||
if (!RetTy->isVoidTy() && VF != 1) {
|
||||
unsigned InsCost =
|
||||
TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy);
|
||||
unsigned ExtCost =
|
||||
TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy);
|
||||
|
||||
// The cost of inserting the results plus extracting each one of the
|
||||
// operands.
|
||||
Cost += VF * (InsCost + ExtCost * I->getNumOperands());
|
||||
}
|
||||
|
||||
default:
|
||||
// The cost of executing VF copies of the scalar instruction. This opcode
|
||||
// is unknown. Assume that it is the same as 'mul'.
|
||||
Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
|
||||
return Cost;
|
||||
}
|
||||
return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
|
||||
getScalarizationOverhead(I, VF, false, TTI);
|
||||
} // end of switch.
|
||||
}
|
||||
|
||||
@ -6407,7 +6568,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
|
||||
}
|
||||
|
||||
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
|
||||
bool IfPredicateStore) {
|
||||
bool IfPredicateInstr) {
|
||||
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
|
||||
// Holds vector parameters or scalars, in case of uniform vals.
|
||||
SmallVector<VectorParts, 4> Params;
|
||||
@ -6450,7 +6611,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
|
||||
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
|
||||
|
||||
VectorParts Cond;
|
||||
if (IfPredicateStore) {
|
||||
if (IfPredicateInstr) {
|
||||
assert(Instr->getParent()->getSinglePredecessor() &&
|
||||
"Only support single predecessor blocks");
|
||||
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
|
||||
@ -6463,7 +6624,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
|
||||
|
||||
// Start an "if (pred) a[i] = ..." block.
|
||||
Value *Cmp = nullptr;
|
||||
if (IfPredicateStore) {
|
||||
if (IfPredicateInstr) {
|
||||
if (Cond[Part]->getType()->isVectorTy())
|
||||
Cond[Part] =
|
||||
Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
|
||||
@ -6494,16 +6655,16 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
|
||||
VecResults[Part] = Cloned;
|
||||
|
||||
// End if-block.
|
||||
if (IfPredicateStore)
|
||||
PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp));
|
||||
if (IfPredicateInstr)
|
||||
PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
|
||||
}
|
||||
}
|
||||
|
||||
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
|
||||
auto *SI = dyn_cast<StoreInst>(Instr);
|
||||
bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
|
||||
bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
|
||||
|
||||
return scalarizeInstruction(Instr, IfPredicateStore);
|
||||
return scalarizeInstruction(Instr, IfPredicateInstr);
|
||||
}
|
||||
|
||||
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
|
||||
|
173
test/Transforms/LoopVectorize/if-pred-non-void.ll
Normal file
173
test/Transforms/LoopVectorize/if-pred-non-void.ll
Normal file
@ -0,0 +1,173 @@
|
||||
; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
; Test predication of non-void instructions, specifically (i) that these
|
||||
; instructions permit vectorization and (ii) the creation of an insertelement
|
||||
; and a Phi node. We check the full 2-element sequence for the first
|
||||
; instruction; For the rest we'll just make sure they get predicated based
|
||||
; on the code generated for the first element.
|
||||
define void @test(i32* nocapture %asd, i32* nocapture %aud,
|
||||
i32* nocapture %asr, i32* nocapture %aur) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup: ; preds = %if.end
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: test
|
||||
; CHECK: vector.body:
|
||||
; CHECK: %[[SDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
|
||||
; CHECK: %[[SDCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[SDEE]], true
|
||||
; CHECK: br i1 %[[SDCC]], label %[[CSD:[a-zA-Z0-9.]+]], label %[[ESD:[a-zA-Z0-9.]+]]
|
||||
; CHECK: [[CSD]]:
|
||||
; CHECK: %[[SDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
|
||||
; CHECK: %[[SDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
|
||||
; CHECK: %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0]], %[[SDA1]]
|
||||
; CHECK: %[[SD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SD0]], i32 0
|
||||
; CHECK: br label %[[ESD]]
|
||||
; CHECK: [[ESD]]:
|
||||
; CHECK: %[[SDR:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[SD1]], %[[CSD]] ]
|
||||
; CHECK: %[[SDEEH:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 1
|
||||
; CHECK: %[[SDCCH:[a-zA-Z0-9]+]] = icmp eq i1 %[[SDEEH]], true
|
||||
; CHECK: br i1 %[[SDCCH]], label %[[CSDH:[a-zA-Z0-9.]+]], label %[[ESDH:[a-zA-Z0-9.]+]]
|
||||
; CHECK: [[CSDH]]:
|
||||
; CHECK: %[[SDA0H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
|
||||
; CHECK: %[[SDA1H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
|
||||
; CHECK: %[[SD0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0H]], %[[SDA1H]]
|
||||
; CHECK: %[[SD1H:[a-zA-Z0-9]+]] = insertelement <2 x i32> %[[SDR]], i32 %[[SD0H]], i32 1
|
||||
; CHECK: br label %[[ESDH]]
|
||||
; CHECK: [[ESDH]]:
|
||||
; CHECK: %{{.*}} = phi <2 x i32> [ %[[SDR]], %[[ESD]] ], [ %[[SD1H]], %[[CSDH]] ]
|
||||
|
||||
; CHECK: %[[UDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
|
||||
; CHECK: %[[UDCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[UDEE]], true
|
||||
; CHECK: br i1 %[[UDCC]], label %[[CUD:[a-zA-Z0-9.]+]], label %[[EUD:[a-zA-Z0-9.]+]]
|
||||
; CHECK: [[CUD]]:
|
||||
; CHECK: %[[UDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
|
||||
; CHECK: %[[UDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
|
||||
; CHECK: %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %[[UDA0]], %[[UDA1]]
|
||||
; CHECK: %[[UD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UD0]], i32 0
|
||||
; CHECK: br label %[[EUD]]
|
||||
; CHECK: [[EUD]]:
|
||||
; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UD1]], %[[CUD]] ]
|
||||
|
||||
; CHECK: %[[SREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
|
||||
; CHECK: %[[SRCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[SREE]], true
|
||||
; CHECK: br i1 %[[SRCC]], label %[[CSR:[a-zA-Z0-9.]+]], label %[[ESR:[a-zA-Z0-9.]+]]
|
||||
; CHECK: [[CSR]]:
|
||||
; CHECK: %[[SRA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
|
||||
; CHECK: %[[SRA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
|
||||
; CHECK: %[[SR0:[a-zA-Z0-9]+]] = srem i32 %[[SRA0]], %[[SRA1]]
|
||||
; CHECK: %[[SR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SR0]], i32 0
|
||||
; CHECK: br label %[[ESR]]
|
||||
; CHECK: [[ESR]]:
|
||||
; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[SR1]], %[[CSR]] ]
|
||||
|
||||
; CHECK: %[[UREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
|
||||
; CHECK: %[[URCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[UREE]], true
|
||||
; CHECK: br i1 %[[URCC]], label %[[CUR:[a-zA-Z0-9.]+]], label %[[EUR:[a-zA-Z0-9.]+]]
|
||||
; CHECK: [[CUR]]:
|
||||
; CHECK: %[[URA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
|
||||
; CHECK: %[[URA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
|
||||
; CHECK: %[[UR0:[a-zA-Z0-9]+]] = urem i32 %[[URA0]], %[[URA1]]
|
||||
; CHECK: %[[UR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UR0]], i32 0
|
||||
; CHECK: br label %[[EUR]]
|
||||
; CHECK: [[EUR]]:
|
||||
; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UR1]], %[[CUR]] ]
|
||||
|
||||
for.body: ; preds = %if.end, %entry
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
|
||||
%isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
|
||||
%iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
|
||||
%isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
|
||||
%iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
|
||||
%lsd = load i32, i32* %isd, align 4
|
||||
%lud = load i32, i32* %iud, align 4
|
||||
%lsr = load i32, i32* %isr, align 4
|
||||
%lur = load i32, i32* %iur, align 4
|
||||
%psd = add nsw i32 %lsd, 23
|
||||
%pud = add nsw i32 %lud, 24
|
||||
%psr = add nsw i32 %lsr, 25
|
||||
%pur = add nsw i32 %lur, 26
|
||||
%cmp1 = icmp slt i32 %lsd, 100
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%rsd = sdiv i32 %psd, %lsd
|
||||
%rud = udiv i32 %pud, %lud
|
||||
%rsr = srem i32 %psr, %lsr
|
||||
%rur = urem i32 %pur, %lur
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
%ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
|
||||
%yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
|
||||
%ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
|
||||
%yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
|
||||
store i32 %ysd.0, i32* %isd, align 4
|
||||
store i32 %yud.0, i32* %iud, align 4
|
||||
store i32 %ysr.0, i32* %isr, align 4
|
||||
store i32 %yur.0, i32* %iur, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 128
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
||||
|
||||
; Future-use test for predication under smarter scalar-scalar: this test will
|
||||
; fail when the vectorizer starts feeding scalarized values directly to their
|
||||
; scalar users, i.e. w/o generating redundant insertelement/extractelement
|
||||
; instructions. This case is already supported by the predication code (which
|
||||
; should generate a phi for the scalar predicated value rather than for the
|
||||
; insertelement), but cannot be tested yet.
|
||||
; If you got this test to fail, kindly fix the test by using the alternative
|
||||
; FFU sequence. This will make the test check how we handle this case from
|
||||
; now on.
|
||||
define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup: ; preds = %if.end
|
||||
ret void
|
||||
|
||||
; CHECK-LABEL: test_scalar2scalar
|
||||
; CHECK: vector.body:
|
||||
; CHECK: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
|
||||
; CHECK: [[THEN]]:
|
||||
; CHECK: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[PDV:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[PD]], i32 0
|
||||
; CHECK: br label %[[FI]]
|
||||
; CHECK: [[FI]]:
|
||||
; CHECK: %[[PH:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[PDV]], %[[THEN]] ]
|
||||
; FFU-LABEL: test_scalar2scalar
|
||||
; FFU: vector.body:
|
||||
; FFU: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
|
||||
; FFU: [[THEN]]:
|
||||
; FFU: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
|
||||
; FFU: br label %[[FI]]
|
||||
; FFU: [[FI]]:
|
||||
; FFU: %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ]
|
||||
|
||||
for.body: ; preds = %if.end, %entry
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
|
||||
%isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
|
||||
%lsd = load i32, i32* %isd, align 4
|
||||
%isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv
|
||||
%lsd.b = load i32, i32* %isd.b, align 4
|
||||
%psd = add nsw i32 %lsd, 23
|
||||
%cmp1 = icmp slt i32 %lsd, 100
|
||||
br i1 %cmp1, label %if.then, label %if.end
|
||||
|
||||
if.then: ; preds = %for.body
|
||||
%sd1 = sdiv i32 %psd, %lsd
|
||||
%rsd = sdiv i32 %lsd.b, %sd1
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %for.body
|
||||
%ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
|
||||
store i32 %ysd.0, i32* %isd, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp eq i64 %indvars.iv.next, 128
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body
|
||||
}
|
90
test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
Normal file
90
test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
Normal file
@ -0,0 +1,90 @@
|
||||
; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Test no-predication of instructions that are provably safe, e.g. dividing by
; a non-zero constant.
;
; The %a* group divides by non-zero constants (11, 13, 17, 19): these may be
; executed unconditionally, so the vectorizer should emit plain vector
; div/rem instructions for them. The %a*0 group divides by zero: those must
; NOT be hoisted out of their predicated block (executing them
; unconditionally would introduce UB), hence the CHECK-NOT lines.
define void @test(i32* nocapture %asd, i32* nocapture %aud,
                  i32* nocapture %asr, i32* nocapture %aur,
                  i32* nocapture %asd0, i32* nocapture %aud0,
                  i32* nocapture %asr0, i32* nocapture %aur0
) {
entry:
  br label %for.body

for.cond.cleanup:                                 ; preds = %if.end
  ret void

; CHECK-LABEL: test
; CHECK: vector.body:
; CHECK: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 11, i32 11>
; CHECK: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 13, i32 13>
; CHECK: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 17, i32 17>
; CHECK: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 19, i32 19>
; CHECK-NOT: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 0, i32 0>
; CHECK-NOT: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 0, i32 0>
; CHECK-NOT: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 0, i32 0>
; CHECK-NOT: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 0, i32 0>

for.body:                                         ; preds = %if.end, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
  %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
  %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
  %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
  %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
  %lsd = load i32, i32* %isd, align 4
  %lud = load i32, i32* %iud, align 4
  %lsr = load i32, i32* %isr, align 4
  %lur = load i32, i32* %iur, align 4
  %psd = add nsw i32 %lsd, 23
  %pud = add nsw i32 %lud, 24
  %psr = add nsw i32 %lsr, 25
  %pur = add nsw i32 %lur, 26
  %isd0 = getelementptr inbounds i32, i32* %asd0, i64 %indvars.iv
  %iud0 = getelementptr inbounds i32, i32* %aud0, i64 %indvars.iv
  %isr0 = getelementptr inbounds i32, i32* %asr0, i64 %indvars.iv
  %iur0 = getelementptr inbounds i32, i32* %aur0, i64 %indvars.iv
  %lsd0 = load i32, i32* %isd0, align 4
  %lud0 = load i32, i32* %iud0, align 4
  %lsr0 = load i32, i32* %isr0, align 4
  %lur0 = load i32, i32* %iur0, align 4
  %psd0 = add nsw i32 %lsd, 27
  %pud0 = add nsw i32 %lud, 28
  %psr0 = add nsw i32 %lsr, 29
  %pur0 = add nsw i32 %lur, 30
  %cmp1 = icmp slt i32 %lsd, 100
  br i1 %cmp1, label %if.then, label %if.end

if.then:                                          ; preds = %for.body
  ; Safe: constant non-zero divisors — eligible for unconditional
  ; vectorization.
  %rsd = sdiv i32 %psd, 11
  %rud = udiv i32 %pud, 13
  %rsr = srem i32 %psr, 17
  %rur = urem i32 %pur, 19
  ; Unsafe: division by zero — must remain predicated (or block
  ; vectorization of these ops), never speculated.
  %rsd0 = sdiv i32 %psd0, 0
  %rud0 = udiv i32 %pud0, 0
  %rsr0 = srem i32 %psr0, 0
  %rur0 = urem i32 %pur0, 0
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
  %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
  %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
  %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
  %ysd0.0 = phi i32 [ %rsd0, %if.then ], [ %psd0, %for.body ]
  %yud0.0 = phi i32 [ %rud0, %if.then ], [ %pud0, %for.body ]
  %ysr0.0 = phi i32 [ %rsr0, %if.then ], [ %psr0, %for.body ]
  %yur0.0 = phi i32 [ %rur0, %if.then ], [ %pur0, %for.body ]
  store i32 %ysd.0, i32* %isd, align 4
  store i32 %yud.0, i32* %iud, align 4
  store i32 %ysr.0, i32* %isr, align 4
  store i32 %yur.0, i32* %iur, align 4
  store i32 %ysd0.0, i32* %isd0, align 4
  store i32 %yud0.0, i32* %iud0, align 4
  store i32 %ysr0.0, i32* %isr0, align 4
  store i32 %yur0.0, i32* %iur0, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 128
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
|
@ -1,7 +1,6 @@
|
||||
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL
|
||||
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
|
||||
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=VEC
|
||||
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg -instcombine < %s | FileCheck %s --check-prefix=VEC-IC
|
||||
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-apple-macosx10.9.0"
|
||||
@ -17,49 +16,27 @@ entry:
|
||||
; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
|
||||
; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0
|
||||
; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true
|
||||
; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
|
||||
; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
|
||||
; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
|
||||
;
|
||||
; VEC: [[cond]]:
|
||||
; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
|
||||
; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
|
||||
; VEC: store i32 %[[v13]], i32* %[[v14]], align 4
|
||||
; VEC: br label %[[else:.+]]
|
||||
;
|
||||
; VEC: [[else]]:
|
||||
; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1
|
||||
; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true
|
||||
; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
|
||||
; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
|
||||
; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
|
||||
;
|
||||
; VEC: [[cond2]]:
|
||||
; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
|
||||
; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
|
||||
; VEC: store i32 %[[v17]], i32* %[[v18]], align 4
|
||||
; VEC: br label %[[else2:.+]]
|
||||
;
|
||||
; VEC: [[else2]]:
|
||||
|
||||
; VEC-IC-LABEL: test
|
||||
; VEC-IC: %[[v1:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
|
||||
; VEC-IC: %[[v2:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
|
||||
; VEC-IC: %[[v3:.+]] = extractelement <2 x i1> %[[v1]], i32 0
|
||||
; VEC-IC: br i1 %[[v3]], label %[[cond:.+]], label %[[else:.+]]
|
||||
;
|
||||
; VEC-IC: [[cond]]:
|
||||
; VEC-IC: %[[v4:.+]] = extractelement <2 x i32> %[[v2]], i32 0
|
||||
; VEC-IC: store i32 %[[v4]], i32* %{{.*}}, align 4
|
||||
; VEC-IC: br label %[[else:.+]]
|
||||
;
|
||||
; VEC-IC: [[else]]:
|
||||
; VEC-IC: %[[v5:.+]] = extractelement <2 x i1> %[[v1]], i32 1
|
||||
; VEC-IC: br i1 %[[v5]], label %[[cond2:.+]], label %[[else2:.+]]
|
||||
;
|
||||
; VEC-IC: [[cond2]]:
|
||||
; VEC-IC: %[[v6:.+]] = extractelement <2 x i32> %[[v2]], i32 1
|
||||
; VEC-IC: store i32 %[[v6]], i32* %{{.*}}, align 4
|
||||
; VEC-IC: br label %[[else2:.+]]
|
||||
;
|
||||
; VEC-IC: [[else2]]:
|
||||
|
||||
; UNROLL-LABEL: test
|
||||
; UNROLL: vector.body:
|
||||
; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0
|
||||
|
Loading…
Reference in New Issue
Block a user