[Loop Vectorizer] Handling loops FP induction variables.

Allowed loop vectorization with secondary FP IVs. Like this:

    float *A;
    float x = init;
    for (int i=0; i < N; ++i) {
      A[i] = x;
      x -= fp_inc;
    }

The auto-vectorization is possible when the induction binary operator is "fast" or the function has "unsafe" attribute.

Differential Revision: https://reviews.llvm.org/D21330

llvm-svn: 276554
2024-11-23 03:02:36 +01:00 · 2016-07-24 07:24:54 +00:00 · 2016-07-24 07:24:54 +00:00 · 6936839d54
commit 6936839d54
parent e1009d96f5
6 changed files with 561 additions and 56 deletions
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@ -263,13 +263,15 @@ public:
  enum InductionKind {
    IK_NoInduction,  ///< Not an induction variable.
    IK_IntInduction, ///< Integer induction variable. Step = C.
-    IK_PtrInduction  ///< Pointer induction var. Step = C / sizeof(elem).
+    IK_PtrInduction, ///< Pointer induction var. Step = C / sizeof(elem).
+    IK_FpInduction   ///< Floating point induction variable.
  };

 public:
  /// Default constructor - creates an invalid induction.
  InductionDescriptor()
-      : StartValue(nullptr), IK(IK_NoInduction), Step(nullptr) {}
+    : StartValue(nullptr), IK(IK_NoInduction), Step(nullptr),
+    InductionBinOp(nullptr) {}

  /// Get the consecutive direction. Returns:
  ///   0 - unknown or non-consecutive.
@ -291,26 +293,58 @@ public:
  const SCEV *getStep() const { return Step; }
  ConstantInt *getConstIntStepValue() const;

-  /// Returns true if \p Phi is an induction. If \p Phi is an induction,
-  /// the induction descriptor \p D will contain the data describing this
-  /// induction. If by some other means the caller has a better SCEV
+  /// Returns true if \p Phi is an induction in the loop \p L. If \p Phi is an
+  /// induction, the induction descriptor \p D will contain the data describing
+  /// this induction. If by some other means the caller has a better SCEV
  /// expression for \p Phi than the one returned by the ScalarEvolution
  /// analysis, it can be passed through \p Expr.
-  static bool isInductionPHI(PHINode *Phi, ScalarEvolution *SE,
+  static bool isInductionPHI(PHINode *Phi, const Loop* L, ScalarEvolution *SE,
                             InductionDescriptor &D,
                             const SCEV *Expr = nullptr);

-  /// Returns true if \p Phi is an induction, in the context associated with
-  /// the run-time predicate of PSE. If \p Assume is true, this can add further
-  /// SCEV predicates to \p PSE in order to prove that \p Phi is an induction.
+  /// Returns true if \p Phi is a floating point induction in the loop \p L.
+  /// If \p Phi is an induction, the induction descriptor \p D will contain 
+  /// the data describing this induction.
+  static bool isFPInductionPHI(PHINode *Phi, const Loop* L,
+                               ScalarEvolution *SE, InductionDescriptor &D);
+
+  /// Returns true if \p Phi is a loop \p L induction, in the context associated
+  /// with the run-time predicate of PSE. If \p Assume is true, this can add
+  /// further SCEV predicates to \p PSE in order to prove that \p Phi is an
+  /// induction.
  /// If \p Phi is an induction, \p D will contain the data describing this
  /// induction.
-  static bool isInductionPHI(PHINode *Phi, PredicatedScalarEvolution &PSE,
+  static bool isInductionPHI(PHINode *Phi, const Loop* L,
+                             PredicatedScalarEvolution &PSE,
                             InductionDescriptor &D, bool Assume = false);

+  /// Returns true if the induction type is FP and the binary operator does
+  /// not have the "fast-math" property. Such operation requires a relaxed FP
+  /// mode.
+  bool hasUnsafeAlgebra() {
+    return InductionBinOp &&
+      !cast<FPMathOperator>(InductionBinOp)->hasUnsafeAlgebra();
+  }
+
+  /// Returns induction operator that does not have "fast-math" property
+  /// and requires FP unsafe mode.
+  Instruction *getUnsafeAlgebraInst() {
+    if (!InductionBinOp ||
+        cast<FPMathOperator>(InductionBinOp)->hasUnsafeAlgebra())
+      return nullptr;
+    return InductionBinOp;
+  }
+
+  /// Returns binary opcode of the induction operator.
+  Instruction::BinaryOps getInductionOpcode() const {
+    return InductionBinOp ? InductionBinOp->getOpcode() :
+      Instruction::BinaryOpsEnd;
+  }
+
 private:
  /// Private constructor - used by \c isInductionPHI.
-  InductionDescriptor(Value *Start, InductionKind K, const SCEV *Step);
+  InductionDescriptor(Value *Start, InductionKind K, const SCEV *Step,
+                      BinaryOperator *InductionBinOp = nullptr);

  /// Start value.
  TrackingVH<Value> StartValue;
@ -318,6 +352,8 @@ private:
  InductionKind IK;
  /// Step value.
  const SCEV *Step;
+  // Instruction that advances induction variable.
+  BinaryOperator *InductionBinOp;
 };

 BasicBlock *InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@ -703,7 +703,7 @@ bool LoopInterchangeLegality::findInductionAndReductions(
    RecurrenceDescriptor RD;
    InductionDescriptor ID;
    PHINode *PHI = cast<PHINode>(I);
-    if (InductionDescriptor::isInductionPHI(PHI, SE, ID))
+    if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
      Inductions.push_back(PHI);
    else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
      Reductions.push_back(PHI);
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@ -654,8 +654,8 @@ Value *RecurrenceDescriptor::createMinMaxOp(IRBuilder<> &Builder,
 }

 InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,
-                                         const SCEV *Step)
-  : StartValue(Start), IK(K), Step(Step) {
+                                         const SCEV *Step, BinaryOperator *BOp)
+  : StartValue(Start), IK(K), Step(Step), InductionBinOp(BOp) {
  assert(IK != IK_NoInduction && "Not an induction");

  // Start value type should match the induction kind and the value
@ -672,7 +672,15 @@ InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K,

  assert((IK != IK_PtrInduction || getConstIntStepValue()) &&
         "Step value should be constant for pointer induction");
-  assert(Step->getType()->isIntegerTy() && "StepValue is not an integer");
+  assert((IK == IK_FpInduction || Step->getType()->isIntegerTy()) &&
+         "StepValue is not an integer");
+
+  assert((IK != IK_FpInduction || Step->getType()->isFloatingPointTy()) &&
+         "StepValue is not FP for FpInduction");
+  assert((IK != IK_FpInduction || (InductionBinOp &&
+          (InductionBinOp->getOpcode() == Instruction::FAdd ||
+           InductionBinOp->getOpcode() == Instruction::FSub))) &&
+         "Binary opcode should be specified for FP induction");
 }

 int InductionDescriptor::getConsecutiveDirection() const {
@ -693,6 +701,8 @@ Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
                                      const DataLayout& DL) const {

  SCEVExpander Exp(*SE, DL, "induction");
+  assert(Index->getType() == Step->getType() &&
+         "Index type does not match StepValue type");
  switch (IK) {
  case IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
@ -717,29 +727,113 @@ Value *InductionDescriptor::transform(IRBuilder<> &B, Value *Index,
    return Exp.expandCodeFor(S, StartValue->getType(), &*B.GetInsertPoint());
  }
  case IK_PtrInduction: {
-    assert(Index->getType() == Step->getType() &&
-           "Index type does not match StepValue type");
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    const SCEV *S = SE->getMulExpr(SE->getSCEV(Index), Step);
    Index = Exp.expandCodeFor(S, Index->getType(), &*B.GetInsertPoint());
    return B.CreateGEP(nullptr, StartValue, Index);
  }
+  case IK_FpInduction: {
+    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
+    assert(InductionBinOp &&
+           (InductionBinOp->getOpcode() == Instruction::FAdd ||
+            InductionBinOp->getOpcode() == Instruction::FSub) &&
+           "Original bin op should be defined for FP induction");
+
+    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
+
+    // Floating point operations had to be 'fast' to enable the induction.
+    FastMathFlags Flags;
+    Flags.setUnsafeAlgebra();
+
+    Value *MulExp = B.CreateFMul(StepValue, Index);
+    if (isa<Instruction>(MulExp))
+      // We have to check, the MulExp may be a constant.
+      cast<Instruction>(MulExp)->setFastMathFlags(Flags);
+
+    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode() , StartValue,
+                               MulExp, "induction");
+    if (isa<Instruction>(BOp))
+      cast<Instruction>(BOp)->setFastMathFlags(Flags);
+
+    return BOp;
+  }
  case IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
 }

-bool InductionDescriptor::isInductionPHI(PHINode *Phi,
+bool InductionDescriptor::isFPInductionPHI(PHINode *Phi, const Loop *TheLoop,
+                                           ScalarEvolution *SE,
+                                           InductionDescriptor &D) {
+
+  // Here we only handle FP induction variables.
+  assert(Phi->getType()->isFloatingPointTy() && "Unexpected Phi type");
+
+  if (TheLoop->getHeader() != Phi->getParent())
+    return false;
+
+  // The loop may have multiple entrances or multiple exits; we can analyze
+  // this phi if it has a unique entry value and a unique backedge value.
+  if (Phi->getNumIncomingValues() != 2)
+    return false;
+  Value *BEValue = nullptr, *StartValue = nullptr;
+  if (TheLoop->contains(Phi->getIncomingBlock(0))) {
+    BEValue = Phi->getIncomingValue(0);
+    StartValue = Phi->getIncomingValue(1);
+  } else {
+    assert(TheLoop->contains(Phi->getIncomingBlock(1)) &&
+           "Unexpected Phi node in the loop"); 
+    BEValue = Phi->getIncomingValue(1);
+    StartValue = Phi->getIncomingValue(0);
+  }
+
+  BinaryOperator *BOp = dyn_cast<BinaryOperator>(BEValue);
+  if (!BOp)
+    return false;
+
+  Value *Addend = nullptr;
+  if (BOp->getOpcode() == Instruction::FAdd) {
+    if (BOp->getOperand(0) == Phi)
+      Addend = BOp->getOperand(1);
+    else if (BOp->getOperand(1) == Phi)
+      Addend = BOp->getOperand(0);
+  } else if (BOp->getOpcode() == Instruction::FSub)
+    if (BOp->getOperand(0) == Phi)
+      Addend = BOp->getOperand(1);
+
+  if (!Addend)
+    return false;
+
+  // The addend should be loop invariant
+  if (auto *I = dyn_cast<Instruction>(Addend))
+    if (TheLoop->contains(I))
+      return false;
+
+  // FP Step has unknown SCEV
+  const SCEV *Step = SE->getUnknown(Addend);
+  D = InductionDescriptor(StartValue, IK_FpInduction, Step, BOp);
+  return true;
+}
+
+bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
                                         PredicatedScalarEvolution &PSE,
                                         InductionDescriptor &D,
                                         bool Assume) {
  Type *PhiTy = Phi->getType();
-  // We only handle integer and pointer inductions variables.
-  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+
+  // Handle integer and pointer inductions variables.
+  // Now we handle also FP induction but not trying to make a
+  // recurrent expression from the PHI node in-place.
+
+  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy() &&
+      !PhiTy->isFloatTy() && !PhiTy->isDoubleTy() && !PhiTy->isHalfTy())
    return false;

+  if (PhiTy->isFloatingPointTy())
+    return isFPInductionPHI(Phi, TheLoop, PSE.getSE(), D);
+
  const SCEV *PhiScev = PSE.getSCEV(Phi);
  const auto *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);

@ -752,10 +846,10 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi,
    return false;
  }

-  return isInductionPHI(Phi, PSE.getSE(), D, AR);
+  return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR);
 }

-bool InductionDescriptor::isInductionPHI(PHINode *Phi,
+bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
                                         ScalarEvolution *SE,
                                         InductionDescriptor &D,
                                         const SCEV *Expr) {
@ -773,7 +867,7 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi,
    return false;
  }

-  assert(AR->getLoop()->getHeader() == Phi->getParent() &&
+  assert(TheLoop->getHeader() == Phi->getParent() &&
         "PHI is an AddRec for a different loop?!");
  Value *StartValue =
    Phi->getIncomingValueForBlock(AR->getLoop()->getLoopPreheader());
@ -781,7 +875,7 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi,
  // Calculate the pointer stride and check if it is consecutive.
  // The stride may be a constant or a loop invariant integer value.
  const SCEVConstant *ConstStep = dyn_cast<SCEVConstant>(Step);
-  if (!ConstStep && !SE->isLoopInvariant(Step, AR->getLoop()))
+  if (!ConstStep && !SE->isLoopInvariant(Step, TheLoop))
    return false;

  if (PhiTy->isIntegerTy()) {
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -402,7 +402,10 @@ protected:

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
-  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
+  /// \p Opcode is relevant for FP induction variable.
+  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+                               Instruction::BinaryOps Opcode =
+                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
@ -625,7 +628,9 @@ private:
                            bool IfPredicateStore = false) override;
  void vectorizeMemoryInstruction(Instruction *Instr) override;
  Value *getBroadcastInstrs(Value *V) override;
-  Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
+  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+                       Instruction::BinaryOps Opcode =
+                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
 };

@ -2000,32 +2005,60 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
  }
 }

-Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
-                                          Value *Step) {
+Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
+                                          Instruction::BinaryOps BinOp) {
+  // Create and check the types.
  assert(Val->getType()->isVectorTy() && "Must be a vector");
-  assert(Val->getType()->getScalarType()->isIntegerTy() &&
-         "Elem must be an integer");
-  assert(Step->getType() == Val->getType()->getScalarType() &&
-         "Step has wrong type");
-  // Create the types.
-  Type *ITy = Val->getType()->getScalarType();
-  VectorType *Ty = cast<VectorType>(Val->getType());
-  int VLen = Ty->getNumElements();
+  int VLen = Val->getType()->getVectorNumElements();
+
+  Type *STy = Val->getType()->getScalarType();
+  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+         "Induction Step must be an integer or FP");
+  assert(Step->getType() == STy && "Step has wrong type");
+
  SmallVector<Constant *, 8> Indices;

+  if (STy->isIntegerTy()) {
+    // Create a vector of consecutive numbers from zero to VF.
+    for (int i = 0; i < VLen; ++i)
+      Indices.push_back(ConstantInt::get(STy, StartIdx + i));
+
+    // Add the consecutive indices to the vector value.
+    Constant *Cv = ConstantVector::get(Indices);
+    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+    Step = Builder.CreateVectorSplat(VLen, Step);
+    assert(Step->getType() == Val->getType() && "Invalid step vec");
+    // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+    // which can be found from the original scalar operations.
+    Step = Builder.CreateMul(Cv, Step);
+    return Builder.CreateAdd(Val, Step, "induction");
+  }
+
+  // Floating point induction.
+  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
-    Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
+    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
-  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+
  Step = Builder.CreateVectorSplat(VLen, Step);
-  assert(Step->getType() == Val->getType() && "Invalid step vec");
-  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
-  // which can be found from the original scalar operations.
-  Step = Builder.CreateMul(Cv, Step);
-  return Builder.CreateAdd(Val, Step, "induction");
+
+  // Floating point operations had to be 'fast' to enable the induction.
+  FastMathFlags Flags;
+  Flags.setUnsafeAlgebra();
+
+  Value *MulOp = Builder.CreateFMul(Cv, Step);
+  if (isa<Instruction>(MulOp))
+    // Have to check, MulOp may be a constant
+    cast<Instruction>(MulOp)->setFastMathFlags(Flags);
+
+  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+  if (isa<Instruction>(BOp))
+    cast<Instruction>(BOp)->setFastMathFlags(Flags);
+  return BOp;
 }

 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
@ -3099,8 +3132,10 @@ void InnerLoopVectorizer::createEmptyLoop() {
      EndValue = CountRoundDown;
    } else {
      IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
-      Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
-                                       II.getStep()->getType(), "cast.crd");
+      Type *StepType = II.getStep()->getType();
+      Instruction::CastOps CastOp = 
+        CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
+      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
      EndValue = II.transform(B, CRD, PSE.getSE(), DL);
      EndValue->setName("ind.end");
@ -4047,7 +4082,7 @@ void InnerLoopVectorizer::widenPHIInstruction(
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
    return widenIntInduction(P, Entry);
-  case InductionDescriptor::IK_PtrInduction:
+  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");
    // This is the normalized GEP that starts counting at zero.
@ -4080,6 +4115,29 @@ void InnerLoopVectorizer::widenPHIInstruction(
    }
    return;
  }
+  case InductionDescriptor::IK_FpInduction: {
+    assert(P->getType() == II.getStartValue()->getType() &&
+           "Types must match");
+    // Handle other induction variables that are now based on the
+    // canonical one.
+    assert(P != OldInduction && "Primary induction can be integer only");
+
+    Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
+    V = II.transform(Builder, V, PSE.getSE(), DL);
+    V->setName("fp.offset.idx");
+
+    // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
+
+    Value *Broadcasted = getBroadcastInstrs(V);
+    // After broadcasting the induction variable we need to make the vector
+    // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
+    Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
+    for (unsigned part = 0; part < UF; ++part)
+      Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
+                                  II.getInductionOpcode());
+    return;
+  }
+  }
 }

 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
@ -4565,10 +4623,12 @@ void LoopVectorizationLegality::addInductionPhi(
  const DataLayout &DL = Phi->getModule()->getDataLayout();

  // Get the widest type.
-  if (!WidestIndTy)
-    WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
-  else
-    WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+  if (!PhiTy->isFloatingPointTy()) {
+    if (!WidestIndTy)
+      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+    else
+      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
@ -4649,8 +4709,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
        }

        InductionDescriptor ID;
-        if (InductionDescriptor::isInductionPHI(Phi, PSE, ID)) {
+        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
+          if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
+            Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
          continue;
        }

@ -4661,7 +4723,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {

        // As a last resort, coerce the PHI to a AddRec expression
        // and re-try classifying it a an induction PHI.
-        if (InductionDescriptor::isInductionPHI(Phi, PSE, ID, true)) {
+        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }
@ -6348,11 +6410,20 @@ Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

-Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
+Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
+                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
-  Type *ITy = Val->getType();
-  assert(!ITy->isVectorTy() && "Val must be a scalar");
-  Constant *C = ConstantInt::get(ITy, StartIdx);
+  Type *Ty = Val->getType();
+  assert(!Ty->isVectorTy() && "Val must be a scalar");
+
+  if (Ty->isFloatingPointTy()) {
+    Constant *C = ConstantFP::get(Ty, (double)StartIdx);
+
+    // Floating point operations had to be 'fast' to enable the unrolling.
+    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
+    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
+  }
+  Constant *C = ConstantInt::get(Ty, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
 }

--- a/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@ -0,0 +1,86 @@
+; RUN: opt < %s  -O3 -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix AUTO_VEC %s
+
+; This test checks auto-vectorization with FP induction variable.
+; The FP operation is not "fast" and requires "fast-math" function attribute.
+
+;void fp_iv_loop1(float * __restrict__ A, int N) {
+;  float x = 1.0;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x += 0.5;
+;  }
+;}
+
+
+; AUTO_VEC-LABEL: @fp_iv_loop1(
+; AUTO_VEC: vector.body
+; AUTO_VEC: store <8 x float>
+
+define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 {
+entry:
+  %cmp4 = icmp sgt i32 %N, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.06, float* %arrayidx, align 4
+  %conv1 = fadd float %x.06, 5.000000e-01
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; The same as the previous, FP operation is not fast, different function attribute
+; Vectorization should be rejected.
+;void fp_iv_loop2(float * __restrict__ A, int N) {
+;  float x = 1.0;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x += 0.5;
+;  }
+;}
+
+; AUTO_VEC-LABEL: @fp_iv_loop2(
+; AUTO_VEC-NOT: vector.body
+; AUTO_VEC-NOT: store <{{.*}} x float>
+
+define void @fp_iv_loop2(float* noalias nocapture %A, i32 %N) #1 {
+entry:
+  %cmp4 = icmp sgt i32 %N, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.06, float* %arrayidx, align 4
+  %conv1 = fadd float %x.06, 5.000000e-01
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { "no-nans-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="false" }
--- a/test/Transforms/LoopVectorize/float-induction.ll
+++ b/test/Transforms/LoopVectorize/float-induction.ll
@ -0,0 +1,218 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop1(
+; VEC4_INTERL1:       %[[FP_INC:.*]] = load float, float* @fp_inc
+; VEC4_INTERL1: vector.body:
+; VEC4_INTERL1:       %[[FP_INDEX:.*]] = sitofp i64 {{.*}} to float
+; VEC4_INTERL1:       %[[VEC_INCR:.*]] = fmul fast float {{.*}}, %[[FP_INDEX]]
+; VEC4_INTERL1:       %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[VEC_INCR]]
+; VEC4_INTERL1:       %[[BRCT_INSERT:.*]] = insertelement <4 x float> undef, float %[[FP_OFFSET_IDX]], i32 0
+; VEC4_INTERL1-NEXT:  %[[BRCT_SPLAT:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1:       %[[BRCT_INSERT:.*]] = insertelement {{.*}} %[[FP_INC]]
+; VEC4_INTERL1-NEXT:  %[[FP_INC_BCST:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], {{.*}} zeroinitializer
+; VEC4_INTERL1:       %[[VSTEP:.*]] = fmul fast <4 x float> %[[FP_INC_BCST]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL1-NEXT:  %[[VEC_INDUCTION:.*]] = fsub fast <4 x float> %[[BRCT_SPLAT]], %[[VSTEP]]
+; VEC4_INTERL1:       store <4 x float> %[[VEC_INDUCTION]]
+
+; VEC4_INTERL2-LABEL: @fp_iv_loop1(
+; VEC4_INTERL2:       %[[FP_INC:.*]] = load float, float* @fp_inc
+; VEC4_INTERL2: vector.body:
+; VEC4_INTERL2:       %[[INDEX:.*]] = sitofp i64 {{.*}} to float
+; VEC4_INTERL2:       %[[VEC_INCR:.*]] = fmul fast float %{{.*}}, %[[INDEX]]
+; VEC4_INTERL2:       fsub fast float %init, %[[VEC_INCR]]
+; VEC4_INTERL2:       %[[VSTEP1:.*]] = fmul fast <4 x float> %{{.*}}, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL2-NEXT:  %[[VEC_INDUCTION1:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP1]]
+; VEC4_INTERL2:       %[[VSTEP2:.*]] = fmul fast <4 x float> %{{.*}}, <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>
+; VEC4_INTERL2-NEXT:  %[[VEC_INDUCTION2:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP2]]
+; VEC4_INTERL2:       store <4 x float> %[[VEC_INDUCTION1]]
+; VEC4_INTERL2:       store <4 x float> %[[VEC_INDUCTION2]]
+
+; VEC1_INTERL2-LABEL: @fp_iv_loop1(
+; VEC1_INTERL2:       %[[FP_INC:.*]] = load float, float* @fp_inc
+; VEC1_INTERL2: vector.body:
+; VEC1_INTERL2:         %[[INDEX:.*]] = sitofp i64 {{.*}} to float
+; VEC1_INTERL2:         %[[STEP:.*]] = fmul fast float %{{.*}}, %[[INDEX]]
+; VEC1_INTERL2:         %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[STEP]]
+; VEC1_INTERL2:         %[[SCALAR_INDUCTION2:.*]] = fsub fast float %[[FP_OFFSET_IDX]], %[[FP_INC]]
+; VEC1_INTERL2:         store float %[[FP_OFFSET_IDX]]
+; VEC1_INTERL2:         store float %[[SCALAR_INDUCTION2]]
+
+@fp_inc = common global float 0.000000e+00, align 4  ; FP step, loaded before the loop in @fp_iv_loop1 and @fp_iv_loop3
+
+;void fp_iv_loop1(float init, float * __restrict__ A, int N) {
+;  float x = init;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x -= fp_inc;
+;  }
+;}
+
+define void @fp_iv_loop1(float %init, float* noalias nocapture %A, i32 %N) #1 {  ; NOTE(review): no 'attributes #1' definition is visible in this file - confirm
+entry:
+  %cmp4 = icmp sgt i32 %N, 0  ; the loop is entered only when N > 0
+  br i1 %cmp4, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %fpinc = load float, float* @fp_inc, align 4  ; step is loaded once, outside the loop
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %x.05 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ]  ; x: FP recurrence starting at %init
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.05, float* %arrayidx, align 4  ; A[i] = x
+  %add = fsub fast float %x.05, %fpinc  ; x -= fp_inc, carrying the 'fast' flag
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N  ; leave once the next index, truncated to i32, equals %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+;void fp_iv_loop2(float init, float * __restrict__ A, int N) {
+;  float x = init;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x += 0.5;
+;  }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop2(
+; VEC4_INTERL1: vector.body
+; VEC4_INTERL1:  %[[index:.*]] = phi i64 [ 0, %vector.ph ]
+; VEC4_INTERL1: sitofp i64 %[[index]] to float
+; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, 5.000000e-01
+; VEC4_INTERL1: %[[VAR2:.*]] = fadd fast float %[[VAR1]]
+; VEC4_INTERL1:  insertelement <4 x float> undef, float %[[VAR2]], i32 0
+; VEC4_INTERL1:  shufflevector <4 x float> {{.*}}, <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1:  fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
+; VEC4_INTERL1:  store <4 x float> 
+
+define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 {  ; NOTE(review): no 'attributes #0' definition is visible in this file - confirm
+entry:
+  %cmp4 = icmp sgt i32 %N, 0  ; the loop is entered only when N > 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x.06 = phi float [ %conv1, %for.body ], [ %init, %for.body.preheader ]  ; x: FP recurrence starting at %init
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.06, float* %arrayidx, align 4  ; A[i] = x
+  %conv1 = fadd fast float %x.06, 5.000000e-01  ; x += 0.5 (constant step), carrying the 'fast' flag
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N  ; leave once the next index, truncated to i32, equals %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+;void fp_iv_loop3(float init, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int N) {
+;  int i = 0;
+;  float x = init;
+;  float y = 0.1;
+;  for (; i < N; ++i) {
+;    A[i] = x;
+;    x += fp_inc;
+;    y -= 0.5;
+;    B[i] = x + y;
+;    C[i] = y;
+;  }
+;}
+; VEC4_INTERL1-LABEL: @fp_iv_loop3(
+; VEC4_INTERL1: vector.body
+; VEC4_INTERL1:  %[[index:.*]] = phi i64 [ 0, %vector.ph ]
+; VEC4_INTERL1: sitofp i64 %[[index]] to float
+; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, -5.000000e-01
+; VEC4_INTERL1:  fadd fast float %[[VAR1]]
+; VEC4_INTERL1:  fadd fast <4 x float> {{.*}}, <float -5.000000e-01, float -1.000000e+00, float -1.500000e+00, float -2.000000e+00>
+; VEC4_INTERL1:  store <4 x float>
+
+define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 {  ; NOTE(review): no 'attributes #1' definition is visible in this file - confirm
+entry:
+  %cmp9 = icmp sgt i32 %N, 0  ; the loop is entered only when N > 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = load float, float* @fp_inc, align 4  ; step for x, loaded once outside the loop
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %y.012 = phi float [ 0x3FB99999A0000000, %for.body.lr.ph ], [ %conv1, %for.body ]  ; y: second FP recurrence (y = 0.1 in the C source above)
+  %x.011 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ]  ; x: first FP recurrence, starting at %init
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.011, float* %arrayidx, align 4  ; A[i] = x (value before the update)
+  %add = fadd fast float %x.011, %0  ; x += fp_inc
+  %conv1 = fadd fast float %y.012, -5.000000e-01  ; y -= 0.5, expressed as an add of -0.5
+  %add2 = fadd fast float %conv1, %add  ; x + y, both already updated
+  %arrayidx4 = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  store float %add2, float* %arrayidx4, align 4  ; B[i] = x + y
+  %arrayidx6 = getelementptr inbounds float, float* %C, i64 %indvars.iv
+  store float %conv1, float* %arrayidx6, align 4  ; C[i] = y (value after the update)
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N  ; leave once the next index, truncated to i32, equals %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Start and step values are constants, so no vector 'fmul' is expected in this case.
+;void fp_iv_loop4(float * __restrict__ A, int N) {
+;  float x = 1.0;
+;  for (int i=0; i < N; ++i) {
+;    A[i] = x;
+;    x += 0.5;
+;  }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop4(
+; VEC4_INTERL1: vector.body
+; VEC4_INTERL1-NOT: fmul fast <4 x float>
+; VEC4_INTERL1:  %[[induction:.*]] = fadd fast <4 x float> %{{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
+; VEC4_INTERL1: store <4 x float> %[[induction]]
+
+define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) {  ; no attribute group: only the 'fast' flag on the fadd is present
+entry:
+  %cmp4 = icmp sgt i32 %N, 0  ; the loop is entered only when N > 0
+  br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]  ; x: FP recurrence with constant start 1.0
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  store float %x.06, float* %arrayidx, align 4  ; A[i] = x
+  %conv1 = fadd fast float %x.06, 5.000000e-01  ; x += 0.5 (constant step), carrying the 'fast' flag
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N  ; leave once the next index, truncated to i32, equals %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}