[SystemZ] TargetTransformInfo cost functions implemented.

Implemented getArithmeticInstrCost(), getShuffleCost(), getCastInstrCost(), getCmpSelInstrCost(), getVectorInstrCost(), getMemoryOpCost() and getInterleavedMemoryOpCost() for SystemZ, and enabled interleaved access vectorization. BasicTTIImpl::getCastInstrCost() now checks for legal extending loads: when the operand of a zext/sext is a load and the corresponding extending load is legal on the target, the cost of the zext/sext instruction becomes 0. Review: Ulrich Weigand, Renato Golin. https://reviews.llvm.org/D29631 llvm-svn: 300052
2024-10-19 02:52:53 +02:00 · 2017-04-12 11:49:08 +00:00 · 2017-04-12 11:49:08 +00:00 · 90b172efa0
commit 90b172efa0
parent cbd29b6ed5
34 changed files with 8899 additions and 104 deletions
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@ -572,8 +572,10 @@ public:
                     Type *SubTp = nullptr) const;

  /// \return The expected cost of cast instructions, such as bitcast, trunc,
-  /// zext, etc.
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const;
+  /// zext, etc. If there is an existing instruction that holds Opcode, it
+  /// may be passed in the 'I' parameter.
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr) const;

  /// \return The expected cost of a sign- or zero-extended vector extract. Use
  /// -1 to indicate that there is no information about the index value.
@ -584,9 +586,11 @@ public:
  /// Phi, Ret, Br.
  int getCFInstrCost(unsigned Opcode) const;

-  /// \returns The expected cost of compare and select instructions.
+  /// \returns The expected cost of compare and select instructions. If there
+  /// is an existing instruction that holds Opcode, it may be passed in the
+  /// 'I' parameter.
  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                         Type *CondTy = nullptr) const;
+                 Type *CondTy = nullptr, const Instruction *I = nullptr) const;

  /// \return The expected cost of vector Insert and Extract.
  /// Use -1 to indicate that there is no information on the index value.
@ -594,7 +598,7 @@ public:

  /// \return The cost of Load and Store instructions.
  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace) const;
+                      unsigned AddressSpace, const Instruction *I = nullptr) const;

  /// \return The cost of masked Load and Store instructions.
  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
@ -821,16 +825,17 @@ public:
                         ArrayRef<const Value *> Args) = 0;
  virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                             Type *SubTp) = 0;
-  virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) = 0;
+  virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                               const Instruction *I) = 0;
  virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                       VectorType *VecTy, unsigned Index) = 0;
  virtual int getCFInstrCost(unsigned Opcode) = 0;
  virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                 Type *CondTy) = 0;
+                                Type *CondTy, const Instruction *I) = 0;
  virtual int getVectorInstrCost(unsigned Opcode, Type *Val,
                                 unsigned Index) = 0;
  virtual int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                              unsigned AddressSpace) = 0;
+                              unsigned AddressSpace, const Instruction *I) = 0;
  virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                    unsigned Alignment,
                                    unsigned AddressSpace) = 0;
@ -1065,8 +1070,9 @@ public:
                     Type *SubTp) override {
    return Impl.getShuffleCost(Kind, Tp, Index, SubTp);
  }
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) override {
-    return Impl.getCastInstrCost(Opcode, Dst, Src);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I) override {
+    return Impl.getCastInstrCost(Opcode, Dst, Src, I);
  }
  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                               unsigned Index) override {
@ -1075,15 +1081,16 @@ public:
  int getCFInstrCost(unsigned Opcode) override {
    return Impl.getCFInstrCost(Opcode);
  }
-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) override {
-    return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I) override {
+    return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
  }
  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override {
    return Impl.getVectorInstrCost(Opcode, Val, Index);
  }
  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace) override {
-    return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+                      unsigned AddressSpace, const Instruction *I) override {
+    return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
  }
  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                            unsigned AddressSpace) override {
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@ -334,7 +334,8 @@ public:
    return 1;
  }

-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return 1; }
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                            const Instruction *I) { return 1; }

  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                    VectorType *VecTy, unsigned Index) {
@ -343,7 +344,8 @@ public:

  unsigned getCFInstrCost(unsigned Opcode) { return 1; }

-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                              const Instruction *I) {
    return 1;
  }

@ -352,7 +354,7 @@ public:
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) {
+                           unsigned AddressSpace, const Instruction *I) {
    return 1;
  }

--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@ -308,8 +308,7 @@ public:

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
-  /// scalar, in which case the costs are multiplied with VF. Vector
-  /// arguments are allowed if 1 is passed for VF.
+  /// scalar, in which case the costs are multiplied with VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
@ -318,8 +317,10 @@ public:
      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        Type *VecTy = nullptr;
        if (A->getType()->isVectorTy()) {
-          assert (VF == 1 && "Vector argument passed with VF > 1");
          VecTy = A->getType();
+          // If A is a vector operand, VF should be 1 or correspond to A.
+          assert ((VF == 1 || VF == VecTy->getVectorNumElements()) &&
+                  "Vector argument does not match VF");
        }
        else
          VecTy = VectorType::get(A->getType(), VF);
@ -331,6 +332,23 @@ public:
    return Cost;
  }

+  /// Estimate the scalarization overhead for an instruction of vector type
+  /// \p VecTy, given its (possibly empty) list of arguments \p Args.
+  ///
+  /// The (VecTy, true, false) overhead is always included. When \p Args is
+  /// non-empty the overhead of its operands is added via
+  /// getOperandsScalarizationOverhead(), using the number of elements of
+  /// \p VecTy as VF; otherwise the (VecTy, false, true) overhead is added.
+  /// NOTE(review): the two boolean flags of the three-argument overload are
+  /// presumably (Insert, Extract) -- confirm against its declaration.
+  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
+    assert (VecTy->isVectorTy());
+
+    const unsigned BaseOverhead = getScalarizationOverhead(VecTy, true, false);
+
+    // When no information on arguments is provided, we add the cost
+    // associated with one argument as a heuristic.
+    if (Args.empty())
+      return BaseOverhead + getScalarizationOverhead(VecTy, false, true);
+
+    return BaseOverhead +
+           getOperandsScalarizationOverhead(Args,
+                                            VecTy->getVectorNumElements());
+  }
+
  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  unsigned getArithmeticInstrCost(
@ -373,15 +391,7 @@ public:
                          ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
-      unsigned TotCost = getScalarizationOverhead(Ty, true, false) + Num * Cost;
-      if (!Args.empty())
-        TotCost += getOperandsScalarizationOverhead(Args, Num);
-      else
-        // When no information on arguments is provided, we add the cost
-        // associated with one argument as a heuristic.
-        TotCost += getScalarizationOverhead(Ty, false, true);
-
-      return TotCost;
+      return getScalarizationOverhead(Ty, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
@ -397,7 +407,8 @@ public:
    return 1;
  }

-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                            const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
@ -426,6 +437,18 @@ public:
                                 Dst->getPointerAddressSpace()))
      return 0;

+    // If this is a zext/sext of a load, return 0 if the corresponding
+    // extending load exists on target.
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+        I && isa<LoadInst>(I->getOperand(0))) {
+        EVT ExtVT = EVT::getEVT(Dst);
+        EVT LoadVT = EVT::getEVT(Src);
+        unsigned LType =
+          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
+        if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
+          return 0;
+    }
+
    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
@ -483,14 +506,14 @@ public:
                                         Src->getVectorNumElements() / 2);
        T *TTI = static_cast<T *>(this);
        return TTI->getVectorSplitCost() +
-               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc));
+               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
      }

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = Dst->getVectorNumElements();
      unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
-          Opcode, Dst->getScalarType(), Src->getScalarType());
+          Opcode, Dst->getScalarType(), Src->getScalarType(), I);

      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
@ -524,7 +547,8 @@ public:
    return 0;
  }

-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                              const Instruction *I) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
@ -552,7 +576,7 @@ public:
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
-          Opcode, ValTy->getScalarType(), CondTy);
+          Opcode, ValTy->getScalarType(), CondTy, I);

      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values.
@ -571,7 +595,7 @@ public:
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) {
+                       unsigned AddressSpace, const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@ -447,25 +447,25 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
  case Instruction::Select: {
    const SelectInst *SI = cast<SelectInst>(I);
    Type *CondTy = SI->getCondition()->getType();
-    return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy);
+    return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
-    return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy);
+    return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy, I->getType(), I);
  }
  case Instruction::Store: {
    const StoreInst *SI = cast<StoreInst>(I);
    Type *ValTy = SI->getValueOperand()->getType();
    return TTI->getMemoryOpCost(I->getOpcode(), ValTy,
                                SI->getAlignment(),
-                                 SI->getPointerAddressSpace());
+                                SI->getPointerAddressSpace(), I);
  }
  case Instruction::Load: {
    const LoadInst *LI = cast<LoadInst>(I);
    return TTI->getMemoryOpCost(I->getOpcode(), I->getType(),
                                LI->getAlignment(),
-                                 LI->getPointerAddressSpace());
+                                LI->getPointerAddressSpace(), I);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
@ -481,7 +481,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast: {
    Type *SrcTy = I->getOperand(0)->getType();
-    return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy);
+    return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy, I);
  }
  case Instruction::ExtractElement: {
    const ExtractElementInst * EEI = cast<ExtractElementInst>(I);
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@ -314,8 +314,10 @@ int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index,
 }

 int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                          Type *Src) const {
-  int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src);
+                                 Type *Src, const Instruction *I) const {
+  assert ((I == nullptr || I->getOpcode() == Opcode) &&
+          "Opcode should reflect passed instruction.");
+  int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, I);
  assert(Cost >= 0 && "TTI should not produce negative costs!");
  return Cost;
 }
@ -335,8 +337,10 @@ int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
 }

 int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                            Type *CondTy) const {
-  int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy);
+                                 Type *CondTy, const Instruction *I) const {
+  assert ((I == nullptr || I->getOpcode() == Opcode) &&
+          "Opcode should reflect passed instruction.");
+  int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
  assert(Cost >= 0 && "TTI should not produce negative costs!");
  return Cost;
 }
@ -350,8 +354,11 @@ int TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,

 int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
                                         unsigned Alignment,
-                                         unsigned AddressSpace) const {
-  int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+                                         unsigned AddressSpace,
+                                         const Instruction *I) const {
+  assert ((I == nullptr || I->getOpcode() == Opcode) &&
+          "Opcode should reflect passed instruction.");
+  int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
  assert(Cost >= 0 && "TTI should not produce negative costs!");
  return Cost;
 }
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@ -176,7 +176,8 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  return TTI::PSK_Software;
 }

-int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                     const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

@ -436,7 +437,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 }

 int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                       Type *CondTy) {
+                                       Type *CondTy, const Instruction *I) {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower some vector selects well that are wider than the register
@ -463,11 +464,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
        return Entry->Cost;
    }
  }
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }

 int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
-                                    unsigned Alignment, unsigned AddressSpace) {
+                                    unsigned Alignment, unsigned AddressSpace,
+                                    const Instruction *I) {
  auto LT = TLI->getTypeLegalizationCost(DL, Ty);

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@ -86,7 +86,8 @@ public:

  unsigned getMaxInterleaveFactor(unsigned VF);

-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);

  int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                               unsigned Index);
@ -103,10 +104,11 @@ public:

  int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);

-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);

  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace);
+                      unsigned AddressSpace, const Instruction *I = nullptr);

  int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);

--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@ -92,7 +92,8 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
 }


-int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

@ -310,7 +311,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
 }

-int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                                   const Instruction *I) {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // On NEON a vector select gets lowered to vbsl.
@ -335,7 +337,7 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
    return LT.first;
  }

-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }

 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@ -504,7 +506,7 @@ int ARMTTIImpl::getArithmeticInstrCost(
 }

 int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                unsigned AddressSpace) {
+                                unsigned AddressSpace, const Instruction *I) {
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);

  if (Src->isVectorTy() && Alignment != 16 &&
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@ -94,9 +94,11 @@ public:

  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);

-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);

-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);

  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);

@ -114,7 +116,7 @@ public:
      ArrayRef<const Value *> Args = ArrayRef<const Value *>());

  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace);
+                      unsigned AddressSpace, const Instruction *I = nullptr);

  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                 ArrayRef<unsigned> Indices, unsigned Alignment,
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@ -302,14 +302,16 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
  return LT.first;
 }

-int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+// Return the cost of the cast instruction Opcode from Src to Dst. No
+// PowerPC-specific adjustment is made here; the opcode is validated and the
+// query is answered by BaseT.
+int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                 const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

-  return BaseT::getCastInstrCost(Opcode, Dst, Src);
+  // Forward I as well (as getCmpSelInstrCost() does), so that it is not
+  // silently dropped on the way to the base implementation.
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
 }

-int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                                   const Instruction *I) {
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }

 int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@ -352,7 +354,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
 }

 int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                unsigned AddressSpace) {
+                                unsigned AddressSpace, const Instruction *I) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@ -74,11 +74,13 @@ public:
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);
  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace);
+                      unsigned AddressSpace, const Instruction *I = nullptr);
  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                 unsigned Factor,
                                 ArrayRef<unsigned> Indices,
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@ -347,9 +347,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
    // There should be no need to check for float types other than v2f64
    // since <2 x f32> isn't a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
  }

  // Handle floating-point types.
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@ -259,11 +259,8 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
        }
      }
      if (isa<StoreInst>(&I)) {
-        NumStores++;
        Type *MemAccessTy = I.getOperand(0)->getType();
-        if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) &&
-           (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128))
-          NumStores++;  // 128 bit fp/int stores get split.
+        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
      }
    }

@ -313,3 +310,547 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
  return 0;
 }

+// Return the SystemZ cost of the arithmetic instruction Opcode on type Ty.
+// Only the opcode / element-size combinations listed below are special-cased;
+// everything else is answered by BaseT::getArithmeticInstrCost().
+int SystemZTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty,
+    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+    TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo,
+    ArrayRef<const Value *> Args) {
+
+  // TODO: return a good value for BB-VECTORIZER that includes the
+  // immediate loads, which we do not want to count for the loop
+  // vectorizer, since they are hopefully hoisted out of the loop. This
+  // would require a new parameter 'InLoop', but not sure if constant
+  // args are common enough to motivate this.
+
+  const unsigned ElemBits = Ty->getScalarSizeInBits();
+
+  if (!Ty->isVectorTy()) {
+    // Scalar:
+    switch (Opcode) {
+    case Instruction::FAdd:
+    case Instruction::FSub:
+    case Instruction::FMul:
+    case Instruction::FDiv:
+      // Supported with a dedicated instruction for float, double and fp128
+      // (base implementation assumes float generally costs 2).
+      return 1;
+
+    case Instruction::FRem:
+      // There is no native support for FRem.
+      return LIBCALL_COST;
+
+    case Instruction::LShr:
+    case Instruction::AShr:
+      // Element sizes below 32 bits cost one more (ext).
+      return (ElemBits >= 32 ? 1 : 2);
+
+    case Instruction::Or:
+      // Or requires one instruction, although it has custom handling for
+      // i64.
+      return 1;
+
+    case Instruction::Xor:
+      // i1 only: 2 * ipm sequences ; xor ; shift ; compare
+      if (ElemBits == 1)
+        return 7;
+      break;
+
+    case Instruction::SDiv:
+    case Instruction::SRem:
+      // An extra extension (sext of op(s)) is needed for narrow types.
+      if (ElemBits < 32)
+        return 4;
+      return (ElemBits == 32 ? 2 : 1);
+
+    case Instruction::UDiv:
+    case Instruction::URem:
+      // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
+      return (ElemBits < 32 ? 4 : 2);
+
+    default:
+      break;
+    }
+  }
+  else {  // Vector:
+    assert (ST->hasVector() &&
+            "getArithmeticInstrCost() called with vector type.");
+    const unsigned NumElts = Ty->getVectorNumElements();
+    const unsigned NumVectors = getNumberOfParts(Ty);
+
+    switch (Opcode) {
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+      // These vector operations are custom handled, but are still supported
+      // with one instruction per vector, regardless of element size.
+      return NumVectors;
+
+    case Instruction::FAdd:
+    case Instruction::FSub:
+    case Instruction::FMul:
+    case Instruction::FDiv: {
+      // Supported with a single vector instruction for double (base
+      // implementation assumes float generally costs 2). For FP128, the
+      // scalar cost is 1, and there is no overhead since the values are
+      // already in scalar registers.
+      if (ElemBits == 64 || ElemBits == 128)
+        return NumVectors;
+
+      if (ElemBits == 32) {
+        // Cost of multiple scalar invocations plus the cost of inserting
+        // and extracting the values.
+        unsigned ScalarCost =
+            getArithmeticInstrCost(Opcode, Ty->getScalarType());
+        unsigned Cost =
+            (NumElts * ScalarCost) + getScalarizationOverhead(Ty, Args);
+        // FIXME: VF 2 for these FP operations are currently just as
+        // expensive as for VF 4.
+        if (NumElts == 2)
+          Cost *= 2;
+        return Cost;
+      }
+
+      // Any other element size is left to the default implementation.
+      break;
+    }
+
+    case Instruction::FRem: {
+      // There is no native support for FRem.
+      unsigned Cost =
+          (NumElts * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
+      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
+      if (NumElts == 2 && ElemBits == 32)
+        Cost *= 2;
+      return Cost;
+    }
+
+    default:
+      break;
+    }
+  }
+
+  // Fallback to the default implementation.
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                       Opd1PropInfo, Opd2PropInfo, Args);
+}
+
+
+// Return the SystemZ cost of a shuffle of kind Kind on the vector type Tp.
+// All costs are expressed in terms of getNumberOfParts(Tp). Index is only
+// consulted for SK_ExtractSubvector, and SubTp is not used: every shuffle
+// kind is costed here, so the base implementation is never consulted.
+int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                                   Type *SubTp) {
+  assert (Tp->isVectorTy());
+  assert (ST->hasVector() && "getShuffleCost() called.");
+  unsigned NumVectors = getNumberOfParts(Tp);
+
+  // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
+
+  // FP128 values are always in scalar registers, so there is no work
+  // involved with a shuffle, except for broadcast. In that case register
+  // moves are done with a single instruction per element.
+  if (Tp->getScalarType()->isFP128Ty())
+    return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
+
+  switch (Kind) {
+  case TargetTransformInfo::SK_ExtractSubvector:
+    // ExtractSubvector Index indicates start offset.
+
+    // Extracting a subvector from first index is a noop.
+    return (Index == 0 ? 0 : NumVectors);
+
+  case TargetTransformInfo::SK_Broadcast:
+    // Loop vectorizer calls here to figure out the extra cost of
+    // broadcasting a loaded value to all elements of a vector. Since vlrep
+    // loads and replicates with a single instruction, adjust the returned
+    // value.
+    return NumVectors - 1;
+
+  default:
+    break;
+  }
+
+  // SystemZ supports single instruction permutation / replication.
+  return NumVectors;
+}
+
+// Return the difference between the log2 of the element sizes (in bits) of
+// Ty0 and Ty1. The argument with the wider element is always the minuend, so
+// the order of the arguments does not matter.
+static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
+  const unsigned ElemBits0 = Ty0->getScalarSizeInBits();
+  const unsigned ElemBits1 = Ty1->getScalarSizeInBits();
+
+  // Decide on the raw bit sizes (not on their logs) which one is wider.
+  const bool SecondIsWider = ElemBits1 > ElemBits0;
+  const unsigned WideBits = (SecondIsWider ? ElemBits1 : ElemBits0);
+  const unsigned NarrowBits = (SecondIsWider ? ElemBits0 : ElemBits1);
+
+  return (Log2_32(WideBits) - Log2_32(NarrowBits));
+}
+
+// Return the number of instructions needed to truncate the vector type SrcTy
+// to the vector type DstTy. Both types must have the same number of
+// elements, and DstTy must be the smaller one.
+unsigned SystemZTTIImpl::getVectorTruncCost(Type *SrcTy, Type *DstTy) {
+  assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
+  assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
+          "Packing must reduce size of vector type.");
+  assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
+          "Packing should not change number of elements.");
+
+  // TODO: Since fp32 is expanded, the extract cost should always be 0.
+
+  unsigned PartsLeft = getNumberOfParts(SrcTy);
+
+  // Up to 2 vector registers can be truncated efficiently with pack or
+  // permute. The latter requires an immediate mask to be loaded, which
+  // typically gets hoisted out of a loop.  TODO: return a good value for
+  // BB-VECTORIZER that includes the immediate loads, which we do not want
+  // to count for the loop vectorizer.
+  if (PartsLeft <= 2)
+    return 1;
+
+  // One step per halving of the element size: each step halves the number
+  // of parts (never going below one) and costs one instruction per
+  // remaining part.
+  unsigned NumInstrs = 0;
+  for (unsigned Step = getElSizeLog2Diff(SrcTy, DstTy); Step != 0; --Step) {
+    if (PartsLeft > 1)
+      PartsLeft /= 2;
+    NumInstrs += PartsLeft;
+  }
+
+  // Currently, a general mix of permutes and pack instructions is output by
+  // isel, which follow the cost computation above except for this case which
+  // is one instruction less:
+  const bool IsEightI64ToI8 = (SrcTy->getVectorNumElements() == 8 &&
+                               SrcTy->getScalarSizeInBits() == 64 &&
+                               DstTy->getScalarSizeInBits() == 8);
+  if (IsEightI64ToI8)
+    NumInstrs--;
+
+  return NumInstrs;
+}
+
+// Return the cost of converting a vector bitmask produced by a compare
+// (SrcTy), to the type of the select or extend instruction (DstTy). The cost
+// is zero when both types have the same element width.
+unsigned SystemZTTIImpl::
+getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
+  assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
+          "Should only be called with vector types.");
+
+  const unsigned MaskElBits = SrcTy->getScalarSizeInBits();
+  const unsigned UserElBits = DstTy->getScalarSizeInBits();
+
+  // Same element width: the mask can be used as it is.
+  if (MaskElBits == UserElBits)
+    return 0;
+
+  // The bitmask will be truncated.
+  if (MaskElBits > UserElBits)
+    return getVectorTruncCost(SrcTy, DstTy);
+
+  // The bitmask will be widened. Each vector select needs its part of the
+  // bitmask unpacked (one unpack per doubling of the element width), plus an
+  // extra cost for moving part of the mask before unpacking.
+  const unsigned NumDstRegs = getNumberOfParts(DstTy);
+  return getElSizeLog2Diff(SrcTy, DstTy) * NumDstRegs + (NumDstRegs - 1);
+}
+
+// Return the type of the compared operands. This is needed to compute the
+// cost for a Select / ZExt or SExt instruction.
+//
+// Operand 0 of 'I' is inspected: it is either a compare itself, or a
+// two-operand instruction whose operands are both compares (in which case the
+// operand type of the first compare is used). Returns nullptr if no compare
+// is found. With VF == 1 the scalar operand type is returned; otherwise a
+// vector of VF elements of the operand's scalar type is returned.
+static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
+  Type *OpTy = nullptr;
+  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
+    OpTy = CI->getOperand(0)->getType();
+  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
+    // Only look through instructions that have exactly two operands. Without
+    // this check, an instruction with fewer operands (e.g. a PHI with a
+    // single incoming compare) would be indexed out of range below.
+    if (LogicI->getNumOperands() == 2)
+      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
+        if (isa<CmpInst>(LogicI->getOperand(1)))
+          OpTy = CI0->getOperand(0)->getType();
+
+  if (OpTy != nullptr) {
+    if (VF == 1) {
+      assert (!OpTy->isVectorTy() && "Expected scalar type");
+      return OpTy;
+    }
+    // Return the potentially vectorized type based on 'I' and 'VF'.  'I' may
+    // be either scalar or already vectorized with a same or lesser VF.
+    Type *ElTy = OpTy->getScalarType();
+    return VectorType::get(ElTy, VF);
+  }
+
+  return nullptr;
+}
+
+// Return the estimated cost of the cast 'Opcode' from type 'Src' to type
+// 'Dst'. 'I' is the existing cast instruction, if any (may be nullptr); it is
+// only used to look up the operand type of a compare feeding an i1 extension.
+//
+// Vector casts handled here: Trunc, ZExt/SExt (from elements of at least 8
+// bits, or from i1), int<->fp conversions, FPTrunc and FPExt. Scalar casts
+// handled here: SIToFP/UIToFP and ZExt/SExt from i1. Everything else is
+// delegated to the base implementation.
+int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                     const Instruction *I) {
+  unsigned DstScalarBits = Dst->getScalarSizeInBits();
+  unsigned SrcScalarBits = Src->getScalarSizeInBits();
+
+  if (Src->isVectorTy()) {
+    assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
+    assert (Dst->isVectorTy());
+    unsigned VF = Src->getVectorNumElements();
+    unsigned NumDstVectors = getNumberOfParts(Dst);
+    unsigned NumSrcVectors = getNumberOfParts(Src);
+
+    if (Opcode == Instruction::Trunc) {
+      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
+        return 0; // Check for NOOP conversions.
+      return getVectorTruncCost(Src, Dst);
+    }
+
+    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
+      if (SrcScalarBits >= 8) {
+        // ZExt/SExt will be handled with one unpack per doubling of width.
+        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
+
+        // For types that span multiple vector registers, some additional
+        // instructions are used to set up the unpacking.
+        unsigned NumSrcVectorOps =
+          (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
+                          : (NumDstVectors / 2));
+
+        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
+      }
+      else if (SrcScalarBits == 1) {
+        // This should be extension of a compare i1 result.
+        // If we know the widths of the compared operands, get the
+        // cost of converting it to Dst. Otherwise assume same widths.
+        unsigned Cost = 0;
+        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+        if (CmpOpTy != nullptr)
+          Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+        if (Opcode == Instruction::ZExt)
+          // One 'vn' per dst vector with an immediate mask.
+          Cost += NumDstVectors;
+        return Cost;
+      }
+      // Source elements of 2-7 bits are not handled here; they fall through
+      // to the base implementation at the end of the function.
+    }
+  
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
+        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
+      // TODO: Fix base implementation which could simplify things a bit here
+      // (seems to miss on differentiating on scalar/vector types).
+
+      // Only 64 bit vector conversions are natively supported.
+      if (SrcScalarBits == 64 && DstScalarBits == 64)
+        return NumDstVectors;
+
+      // Return the cost of multiple scalar invocation plus the cost of
+      // inserting and extracting the values. Base implementation does not
+      // realize float->int gets scalarized.
+      unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
+                                             Src->getScalarType());
+      unsigned TotCost = VF * ScalarCost;
+      bool NeedsInserts = true, NeedsExtracts = true;
+      // FP128 registers do not get inserted or extracted.
+      if (DstScalarBits == 128 &&
+          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
+        NeedsInserts = false;
+      if (SrcScalarBits == 128 &&
+          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
+        NeedsExtracts = false;
+
+      TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+
+      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
+      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
+        TotCost *= 2;
+
+      return TotCost;
+    }
+
+    if (Opcode == Instruction::FPTrunc) {
+      if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
+        return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
+      else // double -> float
+        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
+    }
+
+    if (Opcode == Instruction::FPExt) {
+      if (SrcScalarBits == 32 && DstScalarBits == 64) {
+        // float -> double is very rare and currently unoptimized. Instead of
+        // using vldeb, which can do two at a time, all conversions are
+        // scalarized.
+        return VF * 2;
+      }
+      // -> fp128.  VF * lxdb/lxeb + extraction of elements.
+      return VF + getScalarizationOverhead(Src, false, true);
+    }
+  }
+  else { // Scalar
+    assert (!Dst->isVectorTy());
+
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
+      return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+    
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+        Src->isIntegerTy(1)) {
+      // This should be extension of a compare i1 result, which is done with
+      // ipm and a varying sequence of instructions.
+      unsigned Cost = 0;
+      if (Opcode == Instruction::SExt)
+        Cost = (DstScalarBits < 64 ? 3 : 4);
+      if (Opcode == Instruction::ZExt)
+        Cost = 3;
+      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+        // If operands of an fp-type were compared, this costs +1.
+        Cost++;
+
+      return Cost;
+    }
+  }
+
+  // Anything not matched above is costed by the base implementation.
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+}
+
+// Return the estimated cost of a compare (ICmp / FCmp) or Select instruction
+// with value type 'ValTy' and condition type 'CondTy'. 'I' is the existing
+// instruction holding 'Opcode', if any (may be nullptr). For vector compares
+// it must then be a CmpInst and supplies the predicate; for vector selects it
+// is used to find the compare that produced the condition.
+//
+// Scalar ICmp and Select and all vector cases are handled here; anything
+// else is delegated to the base implementation.
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                                       const Instruction *I) {
+  if (ValTy->isVectorTy()) {
+    assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
+    assert (CondTy == nullptr || CondTy->isVectorTy());
+    unsigned VF = ValTy->getVectorNumElements();
+
+    // Called with a compare instruction.
+    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
+      unsigned PredicateExtraCost = 0;
+      if (I != nullptr) {
+        // Some predicates cost one or two extra instructions.
+        // 'I' must be a compare here; cast<> asserts this instead of
+        // dereferencing a possibly null dyn_cast<> result.
+        switch (cast<CmpInst>(I)->getPredicate()) {
+        case CmpInst::Predicate::ICMP_NE:
+        case CmpInst::Predicate::ICMP_UGE:
+        case CmpInst::Predicate::ICMP_ULE:
+        case CmpInst::Predicate::ICMP_SGE:
+        case CmpInst::Predicate::ICMP_SLE:
+          PredicateExtraCost = 1;
+          break;
+        case CmpInst::Predicate::FCMP_ONE:
+        case CmpInst::Predicate::FCMP_ORD:
+        case CmpInst::Predicate::FCMP_UEQ:
+        case CmpInst::Predicate::FCMP_UNO:
+          PredicateExtraCost = 2;
+          break;
+        default:
+          break;
+        }
+      }
+
+      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
+      // floats.  FIXME: <2 x float> generates same code as <4 x float>.
+      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
+      unsigned NumVecs_cmp = getNumberOfParts(ValTy);
+
+      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
+      return Cost;
+    }
+    else { // Called with a select instruction.
+      assert (Opcode == Instruction::Select);
+
+      // We can figure out the extra cost of packing / unpacking if the
+      // instruction was passed and the compare instruction is found.
+      unsigned PackCost = 0;
+      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+      if (CmpOpTy != nullptr)
+        PackCost =
+          getVectorBitmaskConversionCost(CmpOpTy, ValTy);
+
+      return getNumberOfParts(ValTy) /*vsel*/ + PackCost;
+    }
+  }
+  else { // Scalar
+    switch (Opcode) {
+    case Instruction::ICmp: {
+      unsigned Cost = 1;
+      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
+        Cost += 2; // extend both operands
+      return Cost;
+    }
+    case Instruction::Select:
+      if (ValTy->isFloatingPointTy())
+        return 4; // No load on condition for FP, so this costs a conditional jump.
+      return 1; // Load On Condition.
+    }
+  }
+
+  // Remaining scalar opcodes are costed by the base implementation.
+  // NOTE(review): 'I' is not forwarded here (nullptr is passed) - confirm
+  // that this is intended.
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
+}
+
+// Return the estimated cost of inserting into or extracting from element
+// 'Index' of the vector type 'Val'. Inserts of i64 elements and all extracts
+// are costed here; everything else is left to the base implementation.
+int SystemZTTIImpl::
+getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  Type *ElTy = Val->getScalarType();
+
+  // vlvgp will insert two grs into a vector register, so only count half the
+  // number of instructions: even indices cost one, odd indices are free.
+  if (Opcode == Instruction::InsertElement && ElTy->isIntegerTy(64))
+    return ((Index & 1U) ? 0 : 1);
+
+  if (Opcode != Instruction::ExtractElement)
+    return BaseT::getVectorInstrCost(Opcode, Val, Index);
+
+  // Extracting an i1 element needs an additional test-under-mask.
+  int ExtractCost = 1;
+  if (Val->getScalarSizeInBits() == 1)
+    ExtractCost = 2;
+
+  // Give a slight penalty for moving out of vector pipeline to FXU unit.
+  if (Index == 0 && ElTy->isIntegerTy())
+    ++ExtractCost;
+
+  return ExtractCost;
+}
+
+// Return the estimated cost of the memory operation 'Opcode' on type 'Src'.
+// 'I' is the existing load/store instruction, if any (may be nullptr).
+// 'Alignment' and 'AddressSpace' are not used.
+//
+// A scalar 32 or 64 bit load whose single user is one of the integer
+// operations listed below is treated as folded into that user and costs 0.
+// Otherwise the cost is the number of parts 'Src' is split into, doubled for
+// types with 128 bit scalars.
+int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                    unsigned Alignment, unsigned AddressSpace,
+                                    const Instruction *I) {
+  assert(!Src->isVoidTy() && "Invalid type");
+
+  // Check for a scalar load with a single user that can take a memory
+  // operand.
+  if (!Src->isVectorTy() && Opcode == Instruction::Load &&
+      I != nullptr && I->hasOneUse()) {
+      const Instruction *UserI = cast<Instruction>(*I->user_begin());
+      unsigned Bits = Src->getScalarSizeInBits();
+      bool FoldsLoad = false;
+      switch (UserI->getOpcode()) {
+      case Instruction::ICmp:
+      case Instruction::Add:
+      case Instruction::Sub:
+      case Instruction::Mul:
+      case Instruction::SDiv:
+      case Instruction::UDiv:
+      case Instruction::And:
+      case Instruction::Or:
+      case Instruction::Xor:
+      // This also makes sense for float operations, but disabled for now due
+      // to regressions.
+      // case Instruction::FCmp:
+      // case Instruction::FAdd:
+      // case Instruction::FSub:
+      // case Instruction::FMul:
+      // case Instruction::FDiv:
+        FoldsLoad = (Bits == 32 || Bits == 64);
+        break;
+      }
+
+      if (FoldsLoad) {
+        assert (UserI->getNumOperands() == 2 &&
+                "Expected to only handle binops.");
+
+        // UserI can't fold two loads, so in that case return 0 cost only
+        // half of the time.
+        // If the other operand is also a single-use load, 'i' is that
+        // operand's position: this load costs 1 when the other load is
+        // operand 0, and 0 when the other load is operand 1.
+        for (unsigned i = 0; i < 2; ++i) {
+          if (UserI->getOperand(i) == I)
+            continue;
+          if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) {
+            if (LI->hasOneUse())
+              return i == 0;
+          }
+        }
+
+        // The load is the only foldable operand of its user.
+        return 0;
+      }
+  }
+
+  // Not folded: one operation per part of the type.
+  unsigned NumOps = getNumberOfParts(Src);
+
+  if (Src->getScalarSizeInBits() == 128)
+    // 128 bit scalars are held in a pair of two 64 bit registers.
+    NumOps *= 2;
+
+  return  NumOps;
+}
+
+// Return the estimated cost of an interleaved load or store of the wide
+// vector type 'VecTy' with interleave factor 'Factor'. For loads, 'Indices'
+// lists the group members that are actually used. The result is the number
+// of 128 bit memory operations plus the number of permutes needed.
+// 'Alignment' and 'AddressSpace' are not used.
+int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                               unsigned Factor,
+                                               ArrayRef<unsigned> Indices,
+                                               unsigned Alignment,
+                                               unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) &&
+         "Expect a vector type for interleaved memory op");
+
+  const unsigned NumElts = VecTy->getVectorNumElements();
+
+  // Total width of the wide vector; pointer elements are counted as 64 bits.
+  unsigned WideBits;
+  if (VecTy->isPtrOrPtrVectorTy())
+    WideBits = 64U * NumElts;
+  else
+    WideBits = VecTy->getPrimitiveSizeInBits();
+  assert (WideBits > 0 && "Could not compute size of vector");
+
+  // Number of 128 bit parts needed to hold the wide vector, rounded up.
+  int NumWideParts = WideBits / 128U;
+  if (WideBits % 128U)
+    ++NumWideParts;
+
+  // How many source vectors are handled to produce a vectorized operand?
+  int NumElsPerVector = (NumElts / NumWideParts);
+  int NumSrcParts = NumWideParts;
+  if (NumWideParts > NumElsPerVector)
+    NumSrcParts = NumElsPerVector;
+
+  // Each needed permute takes two vectors as input.
+  if (NumSrcParts > 1)
+    --NumSrcParts;
+
+  // A Load group may have gaps, so only the used members are counted.
+  unsigned NumOperands = Factor;
+  if (Opcode == Instruction::Load)
+    NumOperands = Indices.size();
+
+  int NumPermutes = NumSrcParts * NumOperands;
+
+  // Cost of load/store operations and the permutations needed.
+  return NumWideParts + NumPermutes;
+}
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@ -27,6 +27,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
  const SystemZSubtarget *getST() const { return ST; }
  const SystemZTargetLowering *getTLI() const { return TLI; }

+  unsigned const LIBCALL_COST = 30;
+
 public:
  explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@ -53,6 +55,31 @@ public:
  unsigned getNumberOfRegisters(bool Vector);
  unsigned getRegisterBitWidth(bool Vector);

+  bool enableInterleavedAccessVectorization() { return true; }
+
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+  unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
+  unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);
+  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace, const Instruction *I = nullptr);
+
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor,
+                                 ArrayRef<unsigned> Indices,
+                                 unsigned Alignment,
+                                 unsigned AddressSpace);
  /// @}
 };

--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@ -938,7 +938,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }

-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                 const Instruction *I) {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

@ -1304,7 +1305,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
  return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }

-int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                                   const Instruction *I) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

@ -1370,7 +1372,7 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;

-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }

 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
@ -1615,7 +1617,7 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
 }

 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                unsigned AddressSpace) {
+                                unsigned AddressSpace, const Instruction *I) {
  // Handle non-power-of-two vectors such as <3 x float>
  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
    unsigned NumElem = VTy->getVectorNumElements();
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@ -62,11 +62,13 @@ public:
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);
  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace);
+                      unsigned AddressSpace, const Instruction *I = nullptr);
  int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                            unsigned AddressSpace);
  int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@ -1882,7 +1882,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
             "non noop cast is found during rematerialization");

      Type *SrcTy = CI->getOperand(0)->getType();
-      Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+      Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);

    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
      // Cost of the address calculation
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@ -550,7 +550,8 @@ namespace {
                          TargetTransformInfo::OperandValueKind Op1VK =
                              TargetTransformInfo::OK_AnyValue,
                          TargetTransformInfo::OperandValueKind Op2VK =
-                              TargetTransformInfo::OK_AnyValue) {
+                              TargetTransformInfo::OK_AnyValue,
+                          const Instruction *I = nullptr) {
      switch (Opcode) {
      default: break;
      case Instruction::GetElementPtr:
@ -584,7 +585,7 @@ namespace {
      case Instruction::Select:
      case Instruction::ICmp:
      case Instruction::FCmp:
-        return TTI->getCmpSelInstrCost(Opcode, T1, T2);
+        return TTI->getCmpSelInstrCost(Opcode, T1, T2, I);
      case Instruction::ZExt:
      case Instruction::SExt:
      case Instruction::FPToUI:
@ -598,7 +599,7 @@ namespace {
      case Instruction::FPTrunc:
      case Instruction::BitCast:
      case Instruction::ShuffleVector:
-        return TTI->getCastInstrCost(Opcode, T1, T2);
+        return TTI->getCastInstrCost(Opcode, T1, T2, I);
      }

      return 1;
@ -1044,14 +1045,14 @@ namespace {
        return false;
      }
    } else if (TTI) {
-      unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
-      unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
-      Type *VT1 = getVecTypeForPair(IT1, JT1),
-           *VT2 = getVecTypeForPair(IT2, JT2);
      TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
      TargetTransformInfo::OperandValueKind Op2VK =
          TargetTransformInfo::OK_AnyValue;
+      unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I);
+      unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J);
+      Type *VT1 = getVecTypeForPair(IT1, JT1),
+           *VT2 = getVecTypeForPair(IT2, JT2);

      // On some targets (example X86) the cost of a vector shift may vary
      // depending on whether the second operand is a Uniform or
@ -1090,7 +1091,7 @@ namespace {
      // but this cost is ignored (because insert and extract element
      // instructions are assigned a zero depth factor and are not really
      // fused in general).
-      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
+      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I);

      if (VCost > ICost + JCost)
        return false;
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -7048,7 +7048,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,

  Cost += VF *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
-                              AS);
+                              AS, I);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
@ -7078,7 +7078,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
  else
-    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);

  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
@ -7154,7 +7154,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
    unsigned AS = getMemInstAlignment(I);

    return TTI.getAddressComputationCost(ValTy) +
-           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS);
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
  }
  return getWideningCost(I, VF);
 }
@ -7369,7 +7369,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
@ -7378,7 +7378,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
@ -7403,7 +7403,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
-                                  Trunc->getSrcTy());
+                                  Trunc->getSrcTy(), Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
@ -7427,7 +7427,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
      }
    }

-    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -1762,10 +1762,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {

      // Calculate the cost of this instruction.
      int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                         VL0->getType(), SrcTy);
+                                                         VL0->getType(), SrcTy, VL0);

      VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
      return VecCost - ScalarCost;
    }
    case Instruction::FCmp:
@ -1774,8 +1774,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
      // Calculate the cost of this instruction.
      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
      int ScalarCost = VecTy->getNumElements() *
-          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-      int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+      int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
      return VecCost - ScalarCost;
    }
    case Instruction::Add:
@ -1858,18 +1858,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
      // Cost of wide load - cost of scalar loads.
      unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
      int ScalarLdCost = VecTy->getNumElements() *
-            TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
+          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                           VecTy, alignment, 0);
+                                           VecTy, alignment, 0, VL0);
      return VecLdCost - ScalarLdCost;
    }
    case Instruction::Store: {
      // We know that we can merge the stores. Calculate the cost.
      unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
      int ScalarStCost = VecTy->getNumElements() *
-            TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
+          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
      int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                           VecTy, alignment, 0);
+                                           VecTy, alignment, 0, VL0);
      return VecStCost - ScalarStCost;
    }
    case Instruction::Call: {
--- a/test/Analysis/CostModel/SystemZ/cmp-ext.ll
+++ b/test/Analysis/CostModel/SystemZ/cmp-ext.ll
--- a/test/Analysis/CostModel/SystemZ/cmpsel.ll
+++ b/test/Analysis/CostModel/SystemZ/cmpsel.ll
--- a/test/Analysis/CostModel/SystemZ/ext-load.ll
+++ b/test/Analysis/CostModel/SystemZ/ext-load.ll
@ -0,0 +1,56 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Test that an extension of a load does not get an additional cost in cases
+; where the load performs the extension.
+
+define void @sext() {
+  %li8 = load i8, i8* undef
+  sext i8 %li8 to i16
+  sext i8 %li8 to i32
+  sext i8 %li8 to i64
+
+  %li16 = load i16, i16* undef
+  sext i16 %li16 to i32
+  sext i16 %li16 to i64
+
+  %li32 = load i32, i32* undef
+  sext i32 %li32 to i64
+
+  ret void
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li8 = load i8, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %1 = sext i8 %li8 to i16
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %2 = sext i8 %li8 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %3 = sext i8 %li8 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %4 = sext i16 %li16 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %5 = sext i16 %li16 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %6 = sext i32 %li32 to i64
+}
+
+define void @zext() {
+  %li8 = load i8, i8* undef
+  zext i8 %li8 to i16
+  zext i8 %li8 to i32
+  zext i8 %li8 to i64
+
+  %li16 = load i16, i16* undef
+  zext i16 %li16 to i32
+  zext i16 %li16 to i64
+
+  %li32 = load i32, i32* undef
+  zext i32 %li32 to i64
+
+  ret void
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li8 = load i8, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %1 = zext i8 %li8 to i16
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %2 = zext i8 %li8 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %3 = zext i8 %li8 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %4 = zext i16 %li16 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %5 = zext i16 %li16 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %6 = zext i32 %li32 to i64
+}
--- a/test/Analysis/CostModel/SystemZ/fp-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/fp-arith.ll
@ -0,0 +1,119 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Note: The costs of the scalarized vector instructions do not include any
+; extracts, due to the undef operands.
+;
+; Note: FRem is implemented with a libcall, so it is not included here.
+
+; Cost-model expectations for fadd on float, double and fp128 scalars and on
+; float/double vectors of 2, 4, 8 and 16 elements (all operands are undef).
+; Expected costs: 1 for every scalar and for <2 x double>; 2, 4 and 8 for the
+; wider double vectors; 8, 8, 16 and 32 for the float vectors.
+define void @fadd() {
+  %res0 = fadd float undef, undef
+  %res1 = fadd double undef, undef
+  %res2 = fadd fp128 undef, undef
+  %res3 = fadd <2 x float> undef, undef
+  %res4 = fadd <2 x double> undef, undef
+  %res5 = fadd <4 x float> undef, undef
+  %res6 = fadd <4 x double> undef, undef
+  %res7 = fadd <8 x float> undef, undef
+  %res8 = fadd <8 x double> undef, undef
+  %res9 = fadd <16 x float> undef, undef
+  %res10 = fadd <16 x double> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = fadd float undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = fadd double undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = fadd fp128 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res3 = fadd <2 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = fadd <2 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res5 = fadd <4 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res6 = fadd <4 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %res7 = fadd <8 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res8 = fadd <8 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %res9 = fadd <16 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res10 = fadd <16 x double> undef, undef
+
+  ret void;
+}
+
+; Cost-model expectations for fsub on float, double and fp128 scalars and on
+; float/double vectors of 2, 4, 8 and 16 elements (all operands are undef).
+; Expected costs: 1 for every scalar and for <2 x double>; 2, 4 and 8 for the
+; wider double vectors; 8, 8, 16 and 32 for the float vectors.
+define void @fsub() {
+  %res0 = fsub float undef, undef
+  %res1 = fsub double undef, undef
+  %res2 = fsub fp128 undef, undef
+  %res3 = fsub <2 x float> undef, undef
+  %res4 = fsub <2 x double> undef, undef
+  %res5 = fsub <4 x float> undef, undef
+  %res6 = fsub <4 x double> undef, undef
+  %res7 = fsub <8 x float> undef, undef
+  %res8 = fsub <8 x double> undef, undef
+  %res9 = fsub <16 x float> undef, undef
+  %res10 = fsub <16 x double> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = fsub float undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = fsub double undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = fsub fp128 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res3 = fsub <2 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = fsub <2 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res5 = fsub <4 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res6 = fsub <4 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %res7 = fsub <8 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res8 = fsub <8 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %res9 = fsub <16 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res10 = fsub <16 x double> undef, undef
+
+  ret void;
+}
+
+; Cost-model expectations for fmul on float, double and fp128 scalars and on
+; float/double vectors of 2, 4, 8 and 16 elements (all operands are undef).
+; Expected costs: 1 for every scalar and for <2 x double>; 2, 4 and 8 for the
+; wider double vectors; 8, 8, 16 and 32 for the float vectors.
+define void @fmul() {
+  %res0 = fmul float undef, undef
+  %res1 = fmul double undef, undef
+  %res2 = fmul fp128 undef, undef
+  %res3 = fmul <2 x float> undef, undef
+  %res4 = fmul <2 x double> undef, undef
+  %res5 = fmul <4 x float> undef, undef
+  %res6 = fmul <4 x double> undef, undef
+  %res7 = fmul <8 x float> undef, undef
+  %res8 = fmul <8 x double> undef, undef
+  %res9 = fmul <16 x float> undef, undef
+  %res10 = fmul <16 x double> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = fmul float undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = fmul double undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = fmul fp128 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res3 = fmul <2 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = fmul <2 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res5 = fmul <4 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res6 = fmul <4 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %res7 = fmul <8 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res8 = fmul <8 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %res9 = fmul <16 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res10 = fmul <16 x double> undef, undef
+
+  ret void;
+}
+
+; Cost-model expectations for fdiv on float, double and fp128 scalars and on
+; float/double vectors of 2, 4, 8 and 16 elements (all operands are undef).
+; Expected costs: 1 for every scalar and for <2 x double>; 2, 4 and 8 for the
+; wider double vectors; 8, 8, 16 and 32 for the float vectors.
+define void @fdiv() {
+  %res0 = fdiv float undef, undef
+  %res1 = fdiv double undef, undef
+  %res2 = fdiv fp128 undef, undef
+  %res3 = fdiv <2 x float> undef, undef
+  %res4 = fdiv <2 x double> undef, undef
+  %res5 = fdiv <4 x float> undef, undef
+  %res6 = fdiv <4 x double> undef, undef
+  %res7 = fdiv <8 x float> undef, undef
+  %res8 = fdiv <8 x double> undef, undef
+  %res9 = fdiv <16 x float> undef, undef
+  %res10 = fdiv <16 x double> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = fdiv float undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = fdiv double undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = fdiv fp128 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res3 = fdiv <2 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = fdiv <2 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res5 = fdiv <4 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res6 = fdiv <4 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %res7 = fdiv <8 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res8 = fdiv <8 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %res9 = fdiv <16 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res10 = fdiv <16 x double> undef, undef
+
+  ret void;
+}
+
--- a/test/Analysis/CostModel/SystemZ/fp-cast.ll
+++ b/test/Analysis/CostModel/SystemZ/fp-cast.ll
@ -0,0 +1,541 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Note: The costs of the scalarized vector instructions do not include any
+; extracts, due to the undef operands.
+
+; Cost-model expectations for fpext between float, double and fp128, for
+; scalars and for vectors of 2, 4, 8 and 16 elements (all operands are undef).
+; Expected costs: 1 for every scalar conversion; every vector conversion is
+; expected to cost twice its element count (4, 8, 16, 32), regardless of the
+; source and destination element types.
+define void @fpext() {
+  %v0 = fpext double undef to fp128
+  %v1 = fpext float undef to fp128
+  %v2 = fpext float undef to double
+  %v3 = fpext <2 x double> undef to <2 x fp128>
+  %v4 = fpext <2 x float> undef to <2 x fp128>
+  %v5 = fpext <2 x float> undef to <2 x double>
+  %v6 = fpext <4 x double> undef to <4 x fp128>
+  %v7 = fpext <4 x float> undef to <4 x fp128>
+  %v8 = fpext <4 x float> undef to <4 x double>
+  %v9 = fpext <8 x double> undef to <8 x fp128>
+  %v10 = fpext <8 x float> undef to <8 x fp128>
+  %v11 = fpext <8 x float> undef to <8 x double>
+  %v12 = fpext <16 x float> undef to <16 x double>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = fpext double undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = fpext float undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = fpext float undef to double
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v3 = fpext <2 x double> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v4 = fpext <2 x float> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v5 = fpext <2 x float> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v6 = fpext <4 x double> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v7 = fpext <4 x float> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v8 = fpext <4 x float> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v9 = fpext <8 x double> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v10 = fpext <8 x float> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v11 = fpext <8 x float> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v12 = fpext <16 x float> undef to <16 x double>
+
+  ret void;
+}
+
+; Cost-model expectations for fptosi from fp128, double and float to i64, i32,
+; i16 and i8, for scalars and for vectors of 2, 4, 8 and 16 elements (all
+; operands are undef). The 16-element cases cover only double and float
+; sources.
+; Expected costs: 1 for every scalar conversion. Among the vector cases the
+; double-to-i64 conversions are the cheapest (1, 2, 4, 8 for 2, 4, 8, 16
+; elements); the remaining vector conversions are expected to cost more.
+define void @fptosi() {
+  %v0 = fptosi fp128 undef to i64
+  %v1 = fptosi fp128 undef to i32
+  %v2 = fptosi fp128 undef to i16
+  %v3 = fptosi fp128 undef to i8
+  %v4 = fptosi double undef to i64
+  %v5 = fptosi double undef to i32
+  %v6 = fptosi double undef to i16
+  %v7 = fptosi double undef to i8
+  %v8 = fptosi float undef to i64
+  %v9 = fptosi float undef to i32
+  %v10 = fptosi float undef to i16
+  %v11 = fptosi float undef to i8
+  %v12 = fptosi <2 x fp128> undef to <2 x i64>
+  %v13 = fptosi <2 x fp128> undef to <2 x i32>
+  %v14 = fptosi <2 x fp128> undef to <2 x i16>
+  %v15 = fptosi <2 x fp128> undef to <2 x i8>
+  %v16 = fptosi <2 x double> undef to <2 x i64>
+  %v17 = fptosi <2 x double> undef to <2 x i32>
+  %v18 = fptosi <2 x double> undef to <2 x i16>
+  %v19 = fptosi <2 x double> undef to <2 x i8>
+  %v20 = fptosi <2 x float> undef to <2 x i64>
+  %v21 = fptosi <2 x float> undef to <2 x i32>
+  %v22 = fptosi <2 x float> undef to <2 x i16>
+  %v23 = fptosi <2 x float> undef to <2 x i8>
+  %v24 = fptosi <4 x fp128> undef to <4 x i64>
+  %v25 = fptosi <4 x fp128> undef to <4 x i32>
+  %v26 = fptosi <4 x fp128> undef to <4 x i16>
+  %v27 = fptosi <4 x fp128> undef to <4 x i8>
+  %v28 = fptosi <4 x double> undef to <4 x i64>
+  %v29 = fptosi <4 x double> undef to <4 x i32>
+  %v30 = fptosi <4 x double> undef to <4 x i16>
+  %v31 = fptosi <4 x double> undef to <4 x i8>
+  %v32 = fptosi <4 x float> undef to <4 x i64>
+  %v33 = fptosi <4 x float> undef to <4 x i32>
+  %v34 = fptosi <4 x float> undef to <4 x i16>
+  %v35 = fptosi <4 x float> undef to <4 x i8>
+  %v36 = fptosi <8 x fp128> undef to <8 x i64>
+  %v37 = fptosi <8 x fp128> undef to <8 x i32>
+  %v38 = fptosi <8 x fp128> undef to <8 x i16>
+  %v39 = fptosi <8 x fp128> undef to <8 x i8>
+  %v40 = fptosi <8 x double> undef to <8 x i64>
+  %v41 = fptosi <8 x double> undef to <8 x i32>
+  %v42 = fptosi <8 x double> undef to <8 x i16>
+  %v43 = fptosi <8 x double> undef to <8 x i8>
+  %v44 = fptosi <8 x float> undef to <8 x i64>
+  %v45 = fptosi <8 x float> undef to <8 x i32>
+  %v46 = fptosi <8 x float> undef to <8 x i16>
+  %v47 = fptosi <8 x float> undef to <8 x i8>
+  %v48 = fptosi <16 x double> undef to <16 x i64>
+  %v49 = fptosi <16 x double> undef to <16 x i32>
+  %v50 = fptosi <16 x double> undef to <16 x i16>
+  %v51 = fptosi <16 x double> undef to <16 x i8>
+  %v52 = fptosi <16 x float> undef to <16 x i64>
+  %v53 = fptosi <16 x float> undef to <16 x i32>
+  %v54 = fptosi <16 x float> undef to <16 x i16>
+  %v55 = fptosi <16 x float> undef to <16 x i8>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = fptosi fp128 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = fptosi fp128 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = fptosi fp128 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = fptosi fp128 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = fptosi double undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = fptosi double undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = fptosi double undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = fptosi double undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = fptosi float undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = fptosi float undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = fptosi float undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = fptosi float undef to i8
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v12 = fptosi <2 x fp128> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v13 = fptosi <2 x fp128> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v14 = fptosi <2 x fp128> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = fptosi <2 x fp128> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v16 = fptosi <2 x double> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v17 = fptosi <2 x double> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = fptosi <2 x double> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v19 = fptosi <2 x double> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v20 = fptosi <2 x float> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v21 = fptosi <2 x float> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v22 = fptosi <2 x float> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v23 = fptosi <2 x float> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v24 = fptosi <4 x fp128> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v25 = fptosi <4 x fp128> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v26 = fptosi <4 x fp128> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = fptosi <4 x fp128> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v28 = fptosi <4 x double> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = fptosi <4 x double> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = fptosi <4 x double> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v31 = fptosi <4 x double> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %v32 = fptosi <4 x float> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = fptosi <4 x float> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v34 = fptosi <4 x float> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v35 = fptosi <4 x float> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v36 = fptosi <8 x fp128> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v37 = fptosi <8 x fp128> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v38 = fptosi <8 x fp128> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = fptosi <8 x fp128> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v40 = fptosi <8 x double> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = fptosi <8 x double> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = fptosi <8 x double> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v43 = fptosi <8 x double> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %v44 = fptosi <8 x float> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = fptosi <8 x float> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v46 = fptosi <8 x float> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v47 = fptosi <8 x float> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = fptosi <16 x double> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = fptosi <16 x double> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = fptosi <16 x double> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = fptosi <16 x double> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %v52 = fptosi <16 x float> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v53 = fptosi <16 x float> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v54 = fptosi <16 x float> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v55 = fptosi <16 x float> undef to <16 x i8>
+
+  ret void;
+}
+
+
+; Cost-model expectations for fptoui from fp128, double and float to i64, i32,
+; i16 and i8, for scalars and for vectors of 2, 4, 8 and 16 elements (all
+; operands are undef). The 16-element cases cover only double and float
+; sources.
+; Expected costs: 1 for every scalar conversion. Among the vector cases the
+; double-to-i64 conversions are the cheapest (1, 2, 4, 8 for 2, 4, 8, 16
+; elements); the remaining vector conversions are expected to cost more.
+define void @fptoui() {
+  %v0 = fptoui fp128 undef to i64
+  %v1 = fptoui fp128 undef to i32
+  %v2 = fptoui fp128 undef to i16
+  %v3 = fptoui fp128 undef to i8
+  %v4 = fptoui double undef to i64
+  %v5 = fptoui double undef to i32
+  %v6 = fptoui double undef to i16
+  %v7 = fptoui double undef to i8
+  %v8 = fptoui float undef to i64
+  %v9 = fptoui float undef to i32
+  %v10 = fptoui float undef to i16
+  %v11 = fptoui float undef to i8
+  %v12 = fptoui <2 x fp128> undef to <2 x i64>
+  %v13 = fptoui <2 x fp128> undef to <2 x i32>
+  %v14 = fptoui <2 x fp128> undef to <2 x i16>
+  %v15 = fptoui <2 x fp128> undef to <2 x i8>
+  %v16 = fptoui <2 x double> undef to <2 x i64>
+  %v17 = fptoui <2 x double> undef to <2 x i32>
+  %v18 = fptoui <2 x double> undef to <2 x i16>
+  %v19 = fptoui <2 x double> undef to <2 x i8>
+  %v20 = fptoui <2 x float> undef to <2 x i64>
+  %v21 = fptoui <2 x float> undef to <2 x i32>
+  %v22 = fptoui <2 x float> undef to <2 x i16>
+  %v23 = fptoui <2 x float> undef to <2 x i8>
+  %v24 = fptoui <4 x fp128> undef to <4 x i64>
+  %v25 = fptoui <4 x fp128> undef to <4 x i32>
+  %v26 = fptoui <4 x fp128> undef to <4 x i16>
+  %v27 = fptoui <4 x fp128> undef to <4 x i8>
+  %v28 = fptoui <4 x double> undef to <4 x i64>
+  %v29 = fptoui <4 x double> undef to <4 x i32>
+  %v30 = fptoui <4 x double> undef to <4 x i16>
+  %v31 = fptoui <4 x double> undef to <4 x i8>
+  %v32 = fptoui <4 x float> undef to <4 x i64>
+  %v33 = fptoui <4 x float> undef to <4 x i32>
+  %v34 = fptoui <4 x float> undef to <4 x i16>
+  %v35 = fptoui <4 x float> undef to <4 x i8>
+  %v36 = fptoui <8 x fp128> undef to <8 x i64>
+  %v37 = fptoui <8 x fp128> undef to <8 x i32>
+  %v38 = fptoui <8 x fp128> undef to <8 x i16>
+  %v39 = fptoui <8 x fp128> undef to <8 x i8>
+  %v40 = fptoui <8 x double> undef to <8 x i64>
+  %v41 = fptoui <8 x double> undef to <8 x i32>
+  %v42 = fptoui <8 x double> undef to <8 x i16>
+  %v43 = fptoui <8 x double> undef to <8 x i8>
+  %v44 = fptoui <8 x float> undef to <8 x i64>
+  %v45 = fptoui <8 x float> undef to <8 x i32>
+  %v46 = fptoui <8 x float> undef to <8 x i16>
+  %v47 = fptoui <8 x float> undef to <8 x i8>
+  %v48 = fptoui <16 x double> undef to <16 x i64>
+  %v49 = fptoui <16 x double> undef to <16 x i32>
+  %v50 = fptoui <16 x double> undef to <16 x i16>
+  %v51 = fptoui <16 x double> undef to <16 x i8>
+  %v52 = fptoui <16 x float> undef to <16 x i64>
+  %v53 = fptoui <16 x float> undef to <16 x i32>
+  %v54 = fptoui <16 x float> undef to <16 x i16>
+  %v55 = fptoui <16 x float> undef to <16 x i8>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = fptoui fp128 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = fptoui fp128 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = fptoui fp128 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = fptoui fp128 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = fptoui double undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = fptoui double undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = fptoui double undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = fptoui double undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = fptoui float undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = fptoui float undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = fptoui float undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = fptoui float undef to i8
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v12 = fptoui <2 x fp128> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v13 = fptoui <2 x fp128> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v14 = fptoui <2 x fp128> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = fptoui <2 x fp128> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v16 = fptoui <2 x double> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v17 = fptoui <2 x double> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = fptoui <2 x double> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v19 = fptoui <2 x double> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v20 = fptoui <2 x float> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v21 = fptoui <2 x float> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v22 = fptoui <2 x float> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v23 = fptoui <2 x float> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v24 = fptoui <4 x fp128> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v25 = fptoui <4 x fp128> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v26 = fptoui <4 x fp128> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = fptoui <4 x fp128> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v28 = fptoui <4 x double> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = fptoui <4 x double> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = fptoui <4 x double> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v31 = fptoui <4 x double> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %v32 = fptoui <4 x float> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = fptoui <4 x float> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v34 = fptoui <4 x float> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v35 = fptoui <4 x float> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v36 = fptoui <8 x fp128> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v37 = fptoui <8 x fp128> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v38 = fptoui <8 x fp128> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = fptoui <8 x fp128> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v40 = fptoui <8 x double> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = fptoui <8 x double> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = fptoui <8 x double> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v43 = fptoui <8 x double> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %v44 = fptoui <8 x float> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = fptoui <8 x float> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v46 = fptoui <8 x float> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v47 = fptoui <8 x float> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = fptoui <16 x double> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = fptoui <16 x double> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = fptoui <16 x double> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = fptoui <16 x double> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %v52 = fptoui <16 x float> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v53 = fptoui <16 x float> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v54 = fptoui <16 x float> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v55 = fptoui <16 x float> undef to <16 x i8>
+
+  ret void;
+}
+
+; Cost-model expectations for fptrunc between fp128, double and float, for
+; scalars and for vectors of 2, 4, 8 and 16 elements (all operands are undef).
+; Expected costs: 1 for every scalar conversion; vectors with an fp128 source
+; cost twice their element count (4, 8, 16); double-to-float vectors cost
+; 2, 3, 6 and 12 for 2, 4, 8 and 16 elements.
+define void @fptrunc() {
+  %v0 = fptrunc fp128 undef to double
+  %v1 = fptrunc fp128 undef to float
+  %v2 = fptrunc double undef to float
+  %v3 = fptrunc <2 x fp128> undef to <2 x double>
+  %v4 = fptrunc <2 x fp128> undef to <2 x float>
+  %v5 = fptrunc <2 x double> undef to <2 x float>
+  %v6 = fptrunc <4 x fp128> undef to <4 x double>
+  %v7 = fptrunc <4 x fp128> undef to <4 x float>
+  %v8 = fptrunc <4 x double> undef to <4 x float>
+  %v9 = fptrunc <8 x fp128> undef to <8 x double>
+  %v10 = fptrunc <8 x fp128> undef to <8 x float>
+  %v11 = fptrunc <8 x double> undef to <8 x float>
+  %v12 = fptrunc <16 x double> undef to <16 x float>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = fptrunc fp128 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = fptrunc fp128 undef to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = fptrunc double undef to float
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v3 = fptrunc <2 x fp128> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v4 = fptrunc <2 x fp128> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v5 = fptrunc <2 x double> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v6 = fptrunc <4 x fp128> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v7 = fptrunc <4 x fp128> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v8 = fptrunc <4 x double> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v9 = fptrunc <8 x fp128> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v10 = fptrunc <8 x fp128> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v11 = fptrunc <8 x double> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v12 = fptrunc <16 x double> undef to <16 x float>
+
+  ret void;
+}
+
+define void @sitofp() {
+  %v0 = sitofp i64 undef to fp128
+  %v1 = sitofp i64 undef to double
+  %v2 = sitofp i64 undef to float
+  %v3 = sitofp i32 undef to fp128
+  %v4 = sitofp i32 undef to double
+  %v5 = sitofp i32 undef to float
+  %v6 = sitofp i16 undef to fp128
+  %v7 = sitofp i16 undef to double
+  %v8 = sitofp i16 undef to float
+  %v9 = sitofp i8 undef to fp128
+  %v10 = sitofp i8 undef to double
+  %v11 = sitofp i8 undef to float
+  %v12 = sitofp <2 x i64> undef to <2 x fp128>
+  %v13 = sitofp <2 x i64> undef to <2 x double>
+  %v14 = sitofp <2 x i64> undef to <2 x float>
+  %v15 = sitofp <2 x i32> undef to <2 x fp128>
+  %v16 = sitofp <2 x i32> undef to <2 x double>
+  %v17 = sitofp <2 x i32> undef to <2 x float>
+  %v18 = sitofp <2 x i16> undef to <2 x fp128>
+  %v19 = sitofp <2 x i16> undef to <2 x double>
+  %v20 = sitofp <2 x i16> undef to <2 x float>
+  %v21 = sitofp <2 x i8> undef to <2 x fp128>
+  %v22 = sitofp <2 x i8> undef to <2 x double>
+  %v23 = sitofp <2 x i8> undef to <2 x float>
+  %v24 = sitofp <4 x i64> undef to <4 x fp128>
+  %v25 = sitofp <4 x i64> undef to <4 x double>
+  %v26 = sitofp <4 x i64> undef to <4 x float>
+  %v27 = sitofp <4 x i32> undef to <4 x fp128>
+  %v28 = sitofp <4 x i32> undef to <4 x double>
+  %v29 = sitofp <4 x i32> undef to <4 x float>
+  %v30 = sitofp <4 x i16> undef to <4 x fp128>
+  %v31 = sitofp <4 x i16> undef to <4 x double>
+  %v32 = sitofp <4 x i16> undef to <4 x float>
+  %v33 = sitofp <4 x i8> undef to <4 x fp128>
+  %v34 = sitofp <4 x i8> undef to <4 x double>
+  %v35 = sitofp <4 x i8> undef to <4 x float>
+  %v36 = sitofp <8 x i64> undef to <8 x fp128>
+  %v37 = sitofp <8 x i64> undef to <8 x double>
+  %v38 = sitofp <8 x i64> undef to <8 x float>
+  %v39 = sitofp <8 x i32> undef to <8 x fp128>
+  %v40 = sitofp <8 x i32> undef to <8 x double>
+  %v41 = sitofp <8 x i32> undef to <8 x float>
+  %v42 = sitofp <8 x i16> undef to <8 x fp128>
+  %v43 = sitofp <8 x i16> undef to <8 x double>
+  %v44 = sitofp <8 x i16> undef to <8 x float>
+  %v45 = sitofp <8 x i8> undef to <8 x fp128>
+  %v46 = sitofp <8 x i8> undef to <8 x double>
+  %v47 = sitofp <8 x i8> undef to <8 x float>
+  %v48 = sitofp <16 x i64> undef to <16 x double>
+  %v49 = sitofp <16 x i64> undef to <16 x float>
+  %v50 = sitofp <16 x i32> undef to <16 x double>
+  %v51 = sitofp <16 x i32> undef to <16 x float>
+  %v52 = sitofp <16 x i16> undef to <16 x double>
+  %v53 = sitofp <16 x i16> undef to <16 x float>
+  %v54 = sitofp <16 x i8> undef to <16 x double>
+  %v55 = sitofp <16 x i8> undef to <16 x float>
+  
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = sitofp i64 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = sitofp i64 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = sitofp i64 undef to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = sitofp i32 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = sitofp i32 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = sitofp i32 undef to float
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v6 = sitofp i16 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v7 = sitofp i16 undef to double
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v8 = sitofp i16 undef to float
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v9 = sitofp i8 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = sitofp i8 undef to double
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v11 = sitofp i8 undef to float
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v12 = sitofp <2 x i64> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v13 = sitofp <2 x i64> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v14 = sitofp <2 x i64> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = sitofp <2 x i32> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v16 = sitofp <2 x i32> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v17 = sitofp <2 x i32> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = sitofp <2 x i16> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v19 = sitofp <2 x i16> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v20 = sitofp <2 x i16> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v21 = sitofp <2 x i8> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v22 = sitofp <2 x i8> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v23 = sitofp <2 x i8> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v24 = sitofp <4 x i64> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v25 = sitofp <4 x i64> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v26 = sitofp <4 x i64> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = sitofp <4 x i32> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v28 = sitofp <4 x i32> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = sitofp <4 x i32> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = sitofp <4 x i16> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v31 = sitofp <4 x i16> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v32 = sitofp <4 x i16> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = sitofp <4 x i8> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v34 = sitofp <4 x i8> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v35 = sitofp <4 x i8> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v36 = sitofp <8 x i64> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v37 = sitofp <8 x i64> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v38 = sitofp <8 x i64> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = sitofp <8 x i32> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v40 = sitofp <8 x i32> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = sitofp <8 x i32> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = sitofp <8 x i16> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v43 = sitofp <8 x i16> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v44 = sitofp <8 x i16> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = sitofp <8 x i8> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v46 = sitofp <8 x i8> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v47 = sitofp <8 x i8> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = sitofp <16 x i64> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = sitofp <16 x i64> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = sitofp <16 x i32> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = sitofp <16 x i32> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v52 = sitofp <16 x i16> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v53 = sitofp <16 x i16> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v54 = sitofp <16 x i8> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v55 = sitofp <16 x i8> undef to <16 x float>
+
+  ret void;
+}
+
+; Exercises the cost model on uitofp conversions from i8/i16/i32/i64 sources
+; to float/double/fp128 results: scalars first, then vectors of 2, 4, 8 and
+; 16 elements (the 16-element cases cover only double and float results).
+; The expectations follow the instructions in the same order. As listed below,
+; scalar i32/i64 sources are expected to cost 1 and scalar i8/i16 sources 2;
+; among the vector cases only the i64 -> double conversions stay cheap
+; (1 per two elements), all others have noticeably higher expected costs.
+define void @uitofp() {
+  %v0 = uitofp i64 undef to fp128
+  %v1 = uitofp i64 undef to double
+  %v2 = uitofp i64 undef to float
+  %v3 = uitofp i32 undef to fp128
+  %v4 = uitofp i32 undef to double
+  %v5 = uitofp i32 undef to float
+  %v6 = uitofp i16 undef to fp128
+  %v7 = uitofp i16 undef to double
+  %v8 = uitofp i16 undef to float
+  %v9 = uitofp i8 undef to fp128
+  %v10 = uitofp i8 undef to double
+  %v11 = uitofp i8 undef to float
+  %v12 = uitofp <2 x i64> undef to <2 x fp128>
+  %v13 = uitofp <2 x i64> undef to <2 x double>
+  %v14 = uitofp <2 x i64> undef to <2 x float>
+  %v15 = uitofp <2 x i32> undef to <2 x fp128>
+  %v16 = uitofp <2 x i32> undef to <2 x double>
+  %v17 = uitofp <2 x i32> undef to <2 x float>
+  %v18 = uitofp <2 x i16> undef to <2 x fp128>
+  %v19 = uitofp <2 x i16> undef to <2 x double>
+  %v20 = uitofp <2 x i16> undef to <2 x float>
+  %v21 = uitofp <2 x i8> undef to <2 x fp128>
+  %v22 = uitofp <2 x i8> undef to <2 x double>
+  %v23 = uitofp <2 x i8> undef to <2 x float>
+  %v24 = uitofp <4 x i64> undef to <4 x fp128>
+  %v25 = uitofp <4 x i64> undef to <4 x double>
+  %v26 = uitofp <4 x i64> undef to <4 x float>
+  %v27 = uitofp <4 x i32> undef to <4 x fp128>
+  %v28 = uitofp <4 x i32> undef to <4 x double>
+  %v29 = uitofp <4 x i32> undef to <4 x float>
+  %v30 = uitofp <4 x i16> undef to <4 x fp128>
+  %v31 = uitofp <4 x i16> undef to <4 x double>
+  %v32 = uitofp <4 x i16> undef to <4 x float>
+  %v33 = uitofp <4 x i8> undef to <4 x fp128>
+  %v34 = uitofp <4 x i8> undef to <4 x double>
+  %v35 = uitofp <4 x i8> undef to <4 x float>
+  %v36 = uitofp <8 x i64> undef to <8 x fp128>
+  %v37 = uitofp <8 x i64> undef to <8 x double>
+  %v38 = uitofp <8 x i64> undef to <8 x float>
+  %v39 = uitofp <8 x i32> undef to <8 x fp128>
+  %v40 = uitofp <8 x i32> undef to <8 x double>
+  %v41 = uitofp <8 x i32> undef to <8 x float>
+  %v42 = uitofp <8 x i16> undef to <8 x fp128>
+  %v43 = uitofp <8 x i16> undef to <8 x double>
+  %v44 = uitofp <8 x i16> undef to <8 x float>
+  %v45 = uitofp <8 x i8> undef to <8 x fp128>
+  %v46 = uitofp <8 x i8> undef to <8 x double>
+  %v47 = uitofp <8 x i8> undef to <8 x float>
+  %v48 = uitofp <16 x i64> undef to <16 x double>
+  %v49 = uitofp <16 x i64> undef to <16 x float>
+  %v50 = uitofp <16 x i32> undef to <16 x double>
+  %v51 = uitofp <16 x i32> undef to <16 x float>
+  %v52 = uitofp <16 x i16> undef to <16 x double>
+  %v53 = uitofp <16 x i16> undef to <16 x float>
+  %v54 = uitofp <16 x i8> undef to <16 x double>
+  %v55 = uitofp <16 x i8> undef to <16 x float>
+  
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = uitofp i64 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = uitofp i64 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = uitofp i64 undef to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = uitofp i32 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = uitofp i32 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = uitofp i32 undef to float
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v6 = uitofp i16 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v7 = uitofp i16 undef to double
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v8 = uitofp i16 undef to float
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v9 = uitofp i8 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = uitofp i8 undef to double
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v11 = uitofp i8 undef to float
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v12 = uitofp <2 x i64> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v13 = uitofp <2 x i64> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v14 = uitofp <2 x i64> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = uitofp <2 x i32> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v16 = uitofp <2 x i32> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v17 = uitofp <2 x i32> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = uitofp <2 x i16> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v19 = uitofp <2 x i16> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v20 = uitofp <2 x i16> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v21 = uitofp <2 x i8> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v22 = uitofp <2 x i8> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v23 = uitofp <2 x i8> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v24 = uitofp <4 x i64> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v25 = uitofp <4 x i64> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v26 = uitofp <4 x i64> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = uitofp <4 x i32> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v28 = uitofp <4 x i32> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = uitofp <4 x i32> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = uitofp <4 x i16> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v31 = uitofp <4 x i16> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v32 = uitofp <4 x i16> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = uitofp <4 x i8> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v34 = uitofp <4 x i8> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v35 = uitofp <4 x i8> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v36 = uitofp <8 x i64> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v37 = uitofp <8 x i64> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v38 = uitofp <8 x i64> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = uitofp <8 x i32> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v40 = uitofp <8 x i32> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = uitofp <8 x i32> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = uitofp <8 x i16> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v43 = uitofp <8 x i16> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v44 = uitofp <8 x i16> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = uitofp <8 x i8> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v46 = uitofp <8 x i8> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v47 = uitofp <8 x i8> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = uitofp <16 x i64> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = uitofp <16 x i64> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = uitofp <16 x i32> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = uitofp <16 x i32> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v52 = uitofp <16 x i16> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v53 = uitofp <16 x i16> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v54 = uitofp <16 x i8> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v55 = uitofp <16 x i8> undef to <16 x float>
+
+  ret void;
+}
--- a/test/Analysis/CostModel/SystemZ/int-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/int-arith.ll
@ -0,0 +1,326 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Note: The scalarized vector instruction costs do not include any
+; extracts, because the operands are undef.
+
+; Exercises the cost model on integer add for scalar i8/i16/i32/i64 and for
+; vectors of 2, 4, 8 and 16 elements of each of those element types.
+; The expectations follow the instructions in the same order. As listed below,
+; scalars and vectors of up to 128 bits are expected to cost 1, and wider
+; vectors cost 1 per 128 bits of total vector width (e.g. <16 x i64> -> 8).
+define void @add() {
+  %res0 = add i8 undef, undef
+  %res1 = add i16 undef, undef
+  %res2 = add i32 undef, undef
+  %res3 = add i64 undef, undef
+  %res4 = add <2 x i8> undef, undef
+  %res5 = add <2 x i16> undef, undef
+  %res6 = add <2 x i32> undef, undef
+  %res7 = add <2 x i64> undef, undef
+  %res8 = add <4 x i8> undef, undef
+  %res9 = add <4 x i16> undef, undef
+  %res10 = add <4 x i32> undef, undef
+  %res11 = add <4 x i64> undef, undef
+  %res12 = add <8 x i8> undef, undef
+  %res13 = add <8 x i16> undef, undef
+  %res14 = add <8 x i32> undef, undef
+  %res15 = add <8 x i64> undef, undef
+  %res16 = add <16 x i8> undef, undef
+  %res17 = add <16 x i16> undef, undef
+  %res18 = add <16 x i32> undef, undef
+  %res19 = add <16 x i64> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = add i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = add i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = add i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = add i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = add <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = add <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = add <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = add <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = add <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = add <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = add <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = add <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = add <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = add <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = add <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = add <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = add <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = add <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = add <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = add <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Exercises the cost model on integer sub for scalar i8/i16/i32/i64 and for
+; vectors of 2, 4, 8 and 16 elements of each of those element types.
+; The expectations follow the instructions in the same order. As listed below,
+; scalars and vectors of up to 128 bits are expected to cost 1, and wider
+; vectors cost 1 per 128 bits of total vector width (e.g. <16 x i64> -> 8).
+define void @sub() {
+  %res0 = sub i8 undef, undef
+  %res1 = sub i16 undef, undef
+  %res2 = sub i32 undef, undef
+  %res3 = sub i64 undef, undef
+  %res4 = sub <2 x i8> undef, undef
+  %res5 = sub <2 x i16> undef, undef
+  %res6 = sub <2 x i32> undef, undef
+  %res7 = sub <2 x i64> undef, undef
+  %res8 = sub <4 x i8> undef, undef
+  %res9 = sub <4 x i16> undef, undef
+  %res10 = sub <4 x i32> undef, undef
+  %res11 = sub <4 x i64> undef, undef
+  %res12 = sub <8 x i8> undef, undef
+  %res13 = sub <8 x i16> undef, undef
+  %res14 = sub <8 x i32> undef, undef
+  %res15 = sub <8 x i64> undef, undef
+  %res16 = sub <16 x i8> undef, undef
+  %res17 = sub <16 x i16> undef, undef
+  %res18 = sub <16 x i32> undef, undef
+  %res19 = sub <16 x i64> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = sub i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = sub i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = sub i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = sub i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = sub <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = sub <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = sub <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = sub <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = sub <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = sub <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = sub <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = sub <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = sub <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = sub <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = sub <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = sub <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = sub <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = sub <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = sub <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = sub <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Exercises the cost model on integer mul for scalar i8/i16/i32/i64 and for
+; vectors of 2, 4, 8 and 16 elements of each of those element types.
+; The expectations follow the instructions in the same order. As listed below,
+; scalars cost 1 and i8/i16/i32 vectors cost 1 per 128 bits of vector width,
+; while i64 vectors are expected to cost 3 per two elements
+; (<2 x i64> -> 3, <4 x i64> -> 6, <8 x i64> -> 12, <16 x i64> -> 24).
+define void @mul() {
+  %res0 = mul i8 undef, undef
+  %res1 = mul i16 undef, undef
+  %res2 = mul i32 undef, undef
+  %res3 = mul i64 undef, undef
+  %res4 = mul <2 x i8> undef, undef
+  %res5 = mul <2 x i16> undef, undef
+  %res6 = mul <2 x i32> undef, undef
+  %res7 = mul <2 x i64> undef, undef
+  %res8 = mul <4 x i8> undef, undef
+  %res9 = mul <4 x i16> undef, undef
+  %res10 = mul <4 x i32> undef, undef
+  %res11 = mul <4 x i64> undef, undef
+  %res12 = mul <8 x i8> undef, undef
+  %res13 = mul <8 x i16> undef, undef
+  %res14 = mul <8 x i32> undef, undef
+  %res15 = mul <8 x i64> undef, undef
+  %res16 = mul <16 x i8> undef, undef
+  %res17 = mul <16 x i16> undef, undef
+  %res18 = mul <16 x i32> undef, undef
+  %res19 = mul <16 x i64> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = mul i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = mul i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = mul i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = mul i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = mul <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = mul <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = mul <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = mul <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = mul <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = mul <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = mul <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = mul <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = mul <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = mul <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = mul <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res15 = mul <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = mul <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = mul <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = mul <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res19 = mul <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Exercises the cost model on signed integer division for scalar
+; i8/i16/i32/i64 and for vectors of 2, 4, 8 and 16 elements of each type.
+; The expectations follow the instructions in the same order. As listed below,
+; the scalar costs are 4 (i8, i16), 2 (i32) and 1 (i64); the vector costs
+; start at 10/10/6/3 for two elements and double with every doubling of the
+; element count.
+define void @sdiv() {
+  %res0 = sdiv i8 undef, undef
+  %res1 = sdiv i16 undef, undef
+  %res2 = sdiv i32 undef, undef
+  %res3 = sdiv i64 undef, undef
+  %res4 = sdiv <2 x i8> undef, undef
+  %res5 = sdiv <2 x i16> undef, undef
+  %res6 = sdiv <2 x i32> undef, undef
+  %res7 = sdiv <2 x i64> undef, undef
+  %res8 = sdiv <4 x i8> undef, undef
+  %res9 = sdiv <4 x i16> undef, undef
+  %res10 = sdiv <4 x i32> undef, undef
+  %res11 = sdiv <4 x i64> undef, undef
+  %res12 = sdiv <8 x i8> undef, undef
+  %res13 = sdiv <8 x i16> undef, undef
+  %res14 = sdiv <8 x i32> undef, undef
+  %res15 = sdiv <8 x i64> undef, undef
+  %res16 = sdiv <16 x i8> undef, undef
+  %res17 = sdiv <16 x i16> undef, undef
+  %res18 = sdiv <16 x i32> undef, undef
+  %res19 = sdiv <16 x i64> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = sdiv i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = sdiv i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = sdiv i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = sdiv i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = sdiv <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = sdiv <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = sdiv <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = sdiv <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = sdiv <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = sdiv <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = sdiv <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = sdiv <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res12 = sdiv <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res13 = sdiv <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res14 = sdiv <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res15 = sdiv <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res16 = sdiv <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res17 = sdiv <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %res18 = sdiv <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res19 = sdiv <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Exercises the cost model on signed integer remainder for scalar
+; i8/i16/i32/i64 and for vectors of 2, 4, 8 and 16 elements of each type.
+; The expectations follow the instructions in the same order. As listed below,
+; the scalar costs are 4 (i8, i16), 2 (i32) and 1 (i64); the vector costs
+; start at 10/10/6/3 for two elements and double with every doubling of the
+; element count.
+define void @srem() {
+  %res0 = srem i8 undef, undef
+  %res1 = srem i16 undef, undef
+  %res2 = srem i32 undef, undef
+  %res3 = srem i64 undef, undef
+  %res4 = srem <2 x i8> undef, undef
+  %res5 = srem <2 x i16> undef, undef
+  %res6 = srem <2 x i32> undef, undef
+  %res7 = srem <2 x i64> undef, undef
+  %res8 = srem <4 x i8> undef, undef
+  %res9 = srem <4 x i16> undef, undef
+  %res10 = srem <4 x i32> undef, undef
+  %res11 = srem <4 x i64> undef, undef
+  %res12 = srem <8 x i8> undef, undef
+  %res13 = srem <8 x i16> undef, undef
+  %res14 = srem <8 x i32> undef, undef
+  %res15 = srem <8 x i64> undef, undef
+  %res16 = srem <16 x i8> undef, undef
+  %res17 = srem <16 x i16> undef, undef
+  %res18 = srem <16 x i32> undef, undef
+  %res19 = srem <16 x i64> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = srem i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = srem i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = srem i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = srem i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = srem <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = srem <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = srem <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = srem <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = srem <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = srem <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = srem <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = srem <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res12 = srem <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res13 = srem <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res14 = srem <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res15 = srem <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res16 = srem <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res17 = srem <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %res18 = srem <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res19 = srem <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Exercises the cost model on unsigned integer division for scalar
+; i8/i16/i32/i64 and for vectors of 2, 4, 8 and 16 elements of each type.
+; The expectations follow the instructions in the same order. As listed below,
+; the scalar costs are 4 (i8, i16) and 2 (i32, i64); the vector costs start
+; at 10/10/6/5 for two elements and double with every doubling of the
+; element count.
+define void @udiv() {
+  %res0 = udiv i8 undef, undef
+  %res1 = udiv i16 undef, undef
+  %res2 = udiv i32 undef, undef
+  %res3 = udiv i64 undef, undef
+  %res4 = udiv <2 x i8> undef, undef
+  %res5 = udiv <2 x i16> undef, undef
+  %res6 = udiv <2 x i32> undef, undef
+  %res7 = udiv <2 x i64> undef, undef
+  %res8 = udiv <4 x i8> undef, undef
+  %res9 = udiv <4 x i16> undef, undef
+  %res10 = udiv <4 x i32> undef, undef
+  %res11 = udiv <4 x i64> undef, undef
+  %res12 = udiv <8 x i8> undef, undef
+  %res13 = udiv <8 x i16> undef, undef
+  %res14 = udiv <8 x i32> undef, undef
+  %res15 = udiv <8 x i64> undef, undef
+  %res16 = udiv <16 x i8> undef, undef
+  %res17 = udiv <16 x i16> undef, undef
+  %res18 = udiv <16 x i32> undef, undef
+  %res19 = udiv <16 x i64> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = udiv i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = udiv i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = udiv i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res3 = udiv i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = udiv <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = udiv <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = udiv <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %res7 = udiv <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = udiv <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = udiv <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = udiv <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res11 = udiv <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res12 = udiv <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res13 = udiv <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res14 = udiv <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res15 = udiv <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res16 = udiv <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res17 = udiv <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %res18 = udiv <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res19 = udiv <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Cost-model test for 'urem': one instruction per integer width (i8, i16, i32,
+; i64) as a scalar and as a 2-, 4-, 8- and 16-element vector, all on undef
+; operands. Each CHECK line pins the estimated cost expected in the cost-model
+; output for the instruction with the matching %resN name.
+define void @urem() {
+  %res0 = urem i8 undef, undef
+  %res1 = urem i16 undef, undef
+  %res2 = urem i32 undef, undef
+  %res3 = urem i64 undef, undef
+  %res4 = urem <2 x i8> undef, undef
+  %res5 = urem <2 x i16> undef, undef
+  %res6 = urem <2 x i32> undef, undef
+  %res7 = urem <2 x i64> undef, undef
+  %res8 = urem <4 x i8> undef, undef
+  %res9 = urem <4 x i16> undef, undef
+  %res10 = urem <4 x i32> undef, undef
+  %res11 = urem <4 x i64> undef, undef
+  %res12 = urem <8 x i8> undef, undef
+  %res13 = urem <8 x i16> undef, undef
+  %res14 = urem <8 x i32> undef, undef
+  %res15 = urem <8 x i64> undef, undef
+  %res16 = urem <16 x i8> undef, undef
+  %res17 = urem <16 x i16> undef, undef
+  %res18 = urem <16 x i32> undef, undef
+  %res19 = urem <16 x i64> undef, undef
+
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = urem i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = urem i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = urem i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res3 = urem i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = urem <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = urem <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = urem <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %res7 = urem <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = urem <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = urem <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = urem <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res11 = urem <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res12 = urem <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res13 = urem <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res14 = urem <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res15 = urem <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res16 = urem <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res17 = urem <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %res18 = urem <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res19 = urem <16 x i64> undef, undef
+
+  ret void;
+}
--- a/test/Analysis/CostModel/SystemZ/int-cast.ll
+++ b/test/Analysis/CostModel/SystemZ/int-cast.ll
@ -0,0 +1,199 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; Cost-model test for 'sext': every widening pair among i8, i16, i32 and i64,
+; first as scalars and then as 2-, 4-, 8- and 16-element vectors, all with an
+; undef source operand. Each CHECK line pins the estimated cost expected in the
+; cost-model output for the instruction with the matching %vN name.
+define void @sext() {
+  %v0 = sext i8 undef to i16
+  %v1 = sext i8 undef to i32
+  %v2 = sext i8 undef to i64
+  %v3 = sext i16 undef to i32
+  %v4 = sext i16 undef to i64
+  %v5 = sext i32 undef to i64
+  %v6 = sext <2 x i8> undef to <2 x i16>
+  %v7 = sext <2 x i8> undef to <2 x i32>
+  %v8 = sext <2 x i8> undef to <2 x i64>
+  %v9 = sext <2 x i16> undef to <2 x i32>
+  %v10 = sext <2 x i16> undef to <2 x i64>
+  %v11 = sext <2 x i32> undef to <2 x i64>
+  %v12 = sext <4 x i8> undef to <4 x i16>
+  %v13 = sext <4 x i8> undef to <4 x i32>
+  %v14 = sext <4 x i8> undef to <4 x i64>
+  %v15 = sext <4 x i16> undef to <4 x i32>
+  %v16 = sext <4 x i16> undef to <4 x i64>
+  %v17 = sext <4 x i32> undef to <4 x i64>
+  %v18 = sext <8 x i8> undef to <8 x i16>
+  %v19 = sext <8 x i8> undef to <8 x i32>
+  %v20 = sext <8 x i8> undef to <8 x i64>
+  %v21 = sext <8 x i16> undef to <8 x i32>
+  %v22 = sext <8 x i16> undef to <8 x i64>
+  %v23 = sext <8 x i32> undef to <8 x i64>
+  %v24 = sext <16 x i8> undef to <16 x i16>
+  %v25 = sext <16 x i8> undef to <16 x i32>
+  %v26 = sext <16 x i8> undef to <16 x i64>
+  %v27 = sext <16 x i16> undef to <16 x i32>
+  %v28 = sext <16 x i16> undef to <16 x i64>
+  %v29 = sext <16 x i32> undef to <16 x i64>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = sext i8 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = sext i8 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = sext i8 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = sext i16 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = sext i16 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = sext i32 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = sext <2 x i8> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v7 = sext <2 x i8> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v8 = sext <2 x i8> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = sext <2 x i16> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = sext <2 x i16> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = sext <2 x i32> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v12 = sext <4 x i8> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v13 = sext <4 x i8> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v14 = sext <4 x i8> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v15 = sext <4 x i16> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v16 = sext <4 x i16> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v17 = sext <4 x i32> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v18 = sext <8 x i8> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v19 = sext <8 x i8> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 15 for instruction:   %v20 = sext <8 x i8> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v21 = sext <8 x i16> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v22 = sext <8 x i16> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v23 = sext <8 x i32> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v24 = sext <16 x i8> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v25 = sext <16 x i8> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 31 for instruction:   %v26 = sext <16 x i8> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v27 = sext <16 x i16> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 22 for instruction:   %v28 = sext <16 x i16> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = sext <16 x i32> undef to <16 x i64>
+
+ ret void
+}
+
+; Cost-model test for 'zext': every widening pair among i8, i16, i32 and i64,
+; first as scalars and then as 2-, 4-, 8- and 16-element vectors, all with an
+; undef source operand. Each CHECK line pins the estimated cost expected in the
+; cost-model output for the instruction with the matching %vN name.
+define void @zext() {
+  %v0 = zext i8 undef to i16
+  %v1 = zext i8 undef to i32
+  %v2 = zext i8 undef to i64
+  %v3 = zext i16 undef to i32
+  %v4 = zext i16 undef to i64
+  %v5 = zext i32 undef to i64
+  %v6 = zext <2 x i8> undef to <2 x i16>
+  %v7 = zext <2 x i8> undef to <2 x i32>
+  %v8 = zext <2 x i8> undef to <2 x i64>
+  %v9 = zext <2 x i16> undef to <2 x i32>
+  %v10 = zext <2 x i16> undef to <2 x i64>
+  %v11 = zext <2 x i32> undef to <2 x i64>
+  %v12 = zext <4 x i8> undef to <4 x i16>
+  %v13 = zext <4 x i8> undef to <4 x i32>
+  %v14 = zext <4 x i8> undef to <4 x i64>
+  %v15 = zext <4 x i16> undef to <4 x i32>
+  %v16 = zext <4 x i16> undef to <4 x i64>
+  %v17 = zext <4 x i32> undef to <4 x i64>
+  %v18 = zext <8 x i8> undef to <8 x i16>
+  %v19 = zext <8 x i8> undef to <8 x i32>
+  %v20 = zext <8 x i8> undef to <8 x i64>
+  %v21 = zext <8 x i16> undef to <8 x i32>
+  %v22 = zext <8 x i16> undef to <8 x i64>
+  %v23 = zext <8 x i32> undef to <8 x i64>
+  %v24 = zext <16 x i8> undef to <16 x i16>
+  %v25 = zext <16 x i8> undef to <16 x i32>
+  %v26 = zext <16 x i8> undef to <16 x i64>
+  %v27 = zext <16 x i16> undef to <16 x i32>
+  %v28 = zext <16 x i16> undef to <16 x i64>
+  %v29 = zext <16 x i32> undef to <16 x i64>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = zext i8 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = zext i8 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = zext i8 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = zext i16 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = zext i16 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = zext i32 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = zext <2 x i8> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v7 = zext <2 x i8> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v8 = zext <2 x i8> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = zext <2 x i16> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = zext <2 x i16> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = zext <2 x i32> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v12 = zext <4 x i8> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v13 = zext <4 x i8> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v14 = zext <4 x i8> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v15 = zext <4 x i16> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v16 = zext <4 x i16> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v17 = zext <4 x i32> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v18 = zext <8 x i8> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v19 = zext <8 x i8> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 15 for instruction:   %v20 = zext <8 x i8> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v21 = zext <8 x i16> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v22 = zext <8 x i16> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v23 = zext <8 x i32> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v24 = zext <16 x i8> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v25 = zext <16 x i8> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 31 for instruction:   %v26 = zext <16 x i8> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v27 = zext <16 x i16> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 22 for instruction:   %v28 = zext <16 x i16> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = zext <16 x i32> undef to <16 x i64>
+
+ ret void
+}
+
+; Cost-model test for 'trunc': every narrowing pair among i64, i32, i16 and i8,
+; first as scalars and then as 2-, 4-, 8- and 16-element vectors, all with an
+; undef source operand. Each CHECK line pins the estimated cost expected in the
+; cost-model output for the instruction with the matching %vN name; the six
+; scalar truncations (%v0-%v5) are expected to be reported with cost 0.
+define void @trunc() {
+  %v0 = trunc i16 undef to i8
+  %v1 = trunc i32 undef to i16
+  %v2 = trunc i32 undef to i8
+  %v3 = trunc i64 undef to i32
+  %v4 = trunc i64 undef to i16
+  %v5 = trunc i64 undef to i8
+  %v6 = trunc <2 x i16> undef to <2 x i8>
+  %v7 = trunc <2 x i32> undef to <2 x i16>
+  %v8 = trunc <2 x i32> undef to <2 x i8>
+  %v9 = trunc <2 x i64> undef to <2 x i32>
+  %v10 = trunc <2 x i64> undef to <2 x i16>
+  %v11 = trunc <2 x i64> undef to <2 x i8>
+  %v12 = trunc <4 x i16> undef to <4 x i8>
+  %v13 = trunc <4 x i32> undef to <4 x i16>
+  %v14 = trunc <4 x i32> undef to <4 x i8>
+  %v15 = trunc <4 x i64> undef to <4 x i32>
+  %v16 = trunc <4 x i64> undef to <4 x i16>
+  %v17 = trunc <4 x i64> undef to <4 x i8>
+  %v18 = trunc <8 x i16> undef to <8 x i8>
+  %v19 = trunc <8 x i32> undef to <8 x i16>
+  %v20 = trunc <8 x i32> undef to <8 x i8>
+  %v21 = trunc <8 x i64> undef to <8 x i32>
+  %v22 = trunc <8 x i64> undef to <8 x i16>
+  %v23 = trunc <8 x i64> undef to <8 x i8>
+  %v24 = trunc <16 x i16> undef to <16 x i8>
+  %v25 = trunc <16 x i32> undef to <16 x i16>
+  %v26 = trunc <16 x i32> undef to <16 x i8>
+  %v27 = trunc <16 x i64> undef to <16 x i32>
+  %v28 = trunc <16 x i64> undef to <16 x i16>
+  %v29 = trunc <16 x i64> undef to <16 x i8>
+
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v0 = trunc i16 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v1 = trunc i32 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v2 = trunc i32 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v3 = trunc i64 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v4 = trunc i64 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v5 = trunc i64 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = trunc <2 x i16> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = trunc <2 x i32> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = trunc <2 x i32> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = trunc <2 x i64> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = trunc <2 x i64> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = trunc <2 x i64> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v12 = trunc <4 x i16> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v13 = trunc <4 x i32> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v14 = trunc <4 x i32> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v15 = trunc <4 x i64> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v16 = trunc <4 x i64> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v17 = trunc <4 x i64> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v18 = trunc <8 x i16> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v19 = trunc <8 x i32> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v20 = trunc <8 x i32> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v21 = trunc <8 x i64> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v22 = trunc <8 x i64> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v23 = trunc <8 x i64> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v24 = trunc <16 x i16> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v25 = trunc <16 x i32> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v26 = trunc <16 x i32> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v27 = trunc <16 x i64> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v28 = trunc <16 x i64> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v29 = trunc <16 x i64> undef to <16 x i8>
+
+ ret void
+}
--- a/test/Analysis/CostModel/SystemZ/load_store.ll
+++ b/test/Analysis/CostModel/SystemZ/load_store.ll
@ -0,0 +1,137 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; Cost-model test for 'store': one store of an undef value through an undef
+; pointer for each scalar type (i8, i16, i32, i64, float, double, fp128) and
+; for 2-, 4-, 8- and 16-element vectors of i8, i16, i32, i64, float and double.
+; Each CHECK line pins the estimated cost expected in the cost-model output for
+; the store of the matching type.
+define void @store() {
+  store i8 undef, i8* undef
+  store i16 undef, i16* undef
+  store i32 undef, i32* undef
+  store i64 undef, i64* undef
+  store float undef, float* undef
+  store double undef, double* undef
+  store fp128 undef, fp128* undef
+  store <2 x i8> undef, <2 x i8>* undef
+  store <2 x i16> undef, <2 x i16>* undef
+  store <2 x i32> undef, <2 x i32>* undef
+  store <2 x i64> undef, <2 x i64>* undef
+  store <2 x float> undef, <2 x float>* undef
+  store <2 x double> undef, <2 x double>* undef
+  store <4 x i8> undef, <4 x i8>* undef
+  store <4 x i16> undef, <4 x i16>* undef
+  store <4 x i32> undef, <4 x i32>* undef
+  store <4 x i64> undef, <4 x i64>* undef
+  store <4 x float> undef, <4 x float>* undef
+  store <4 x double> undef, <4 x double>* undef
+  store <8 x i8> undef, <8 x i8>* undef
+  store <8 x i16> undef, <8 x i16>* undef
+  store <8 x i32> undef, <8 x i32>* undef
+  store <8 x i64> undef, <8 x i64>* undef
+  store <8 x float> undef, <8 x float>* undef
+  store <8 x double> undef, <8 x double>* undef
+  store <16 x i8> undef, <16 x i8>* undef
+  store <16 x i16> undef, <16 x i16>* undef
+  store <16 x i32> undef, <16 x i32>* undef
+  store <16 x i64> undef, <16 x i64>* undef
+  store <16 x float> undef, <16 x float>* undef
+  store <16 x double> undef, <16 x double>* undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i8 undef, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i16 undef, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i32 undef, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i64 undef, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store float undef, float* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store double undef, double* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store fp128 undef, fp128* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i8> undef, <2 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i16> undef, <2 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i32> undef, <2 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i64> undef, <2 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x float> undef, <2 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x double> undef, <2 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <4 x i8> undef, <4 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <4 x i16> undef, <4 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <4 x i32> undef, <4 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <4 x i64> undef, <4 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <4 x float> undef, <4 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <4 x double> undef, <4 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <8 x i8> undef, <8 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <8 x i16> undef, <8 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <8 x i32> undef, <8 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   store <8 x i64> undef, <8 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <8 x float> undef, <8 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   store <8 x double> undef, <8 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <16 x i8> undef, <16 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <16 x i16> undef, <16 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   store <16 x i32> undef, <16 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   store <16 x i64> undef, <16 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   store <16 x float> undef, <16 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   store <16 x double> undef, <16 x double>* undef
+
+  ret void;
+}
+
+; Cost-model test for 'load': one load through an undef pointer for each scalar
+; type (i8, i16, i32, i64, float, double, fp128) and for 2-, 4-, 8- and
+; 16-element vectors of i8, i16, i32, i64, float and double.
+; The loads are written without result names, so the CHECK lines identify them
+; by the sequential value numbers %1..%31 that appear in the printed IR.
+define void @load() {
+  load i8, i8* undef
+  load i16, i16* undef
+  load i32, i32* undef
+  load i64, i64* undef
+  load float, float* undef
+  load double, double* undef
+  load fp128, fp128* undef
+  load <2 x i8>, <2 x i8>* undef
+  load <2 x i16>, <2 x i16>* undef
+  load <2 x i32>, <2 x i32>* undef
+  load <2 x i64>, <2 x i64>* undef
+  load <2 x float>, <2 x float>* undef
+  load <2 x double>, <2 x double>* undef
+  load <4 x i8>, <4 x i8>* undef
+  load <4 x i16>, <4 x i16>* undef
+  load <4 x i32>, <4 x i32>* undef
+  load <4 x i64>, <4 x i64>* undef
+  load <4 x float>, <4 x float>* undef
+  load <4 x double>, <4 x double>* undef
+  load <8 x i8>, <8 x i8>* undef
+  load <8 x i16>, <8 x i16>* undef
+  load <8 x i32>, <8 x i32>* undef
+  load <8 x i64>, <8 x i64>* undef
+  load <8 x float>, <8 x float>* undef
+  load <8 x double>, <8 x double>* undef
+  load <16 x i8>, <16 x i8>* undef
+  load <16 x i16>, <16 x i16>* undef
+  load <16 x i32>, <16 x i32>* undef
+  load <16 x i64>, <16 x i64>* undef
+  load <16 x float>, <16 x float>* undef
+  load <16 x double>, <16 x double>* undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = load i8, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = load float, float* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = load double, double* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %7 = load fp128, fp128* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = load <2 x i8>, <2 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = load <2 x i16>, <2 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = load <2 x i32>, <2 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %11 = load <2 x i64>, <2 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %12 = load <2 x float>, <2 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %13 = load <2 x double>, <2 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %14 = load <4 x i8>, <4 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %15 = load <4 x i16>, <4 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %16 = load <4 x i32>, <4 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %17 = load <4 x i64>, <4 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %18 = load <4 x float>, <4 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %19 = load <4 x double>, <4 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %20 = load <8 x i8>, <8 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %21 = load <8 x i16>, <8 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %22 = load <8 x i32>, <8 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %23 = load <8 x i64>, <8 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %24 = load <8 x float>, <8 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %25 = load <8 x double>, <8 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %26 = load <16 x i8>, <16 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %27 = load <16 x i16>, <16 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %28 = load <16 x i32>, <16 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %29 = load <16 x i64>, <16 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %30 = load <16 x float>, <16 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %31 = load <16 x double>, <16 x double>* undef
+
+  ret void;
+}
--- a/test/Analysis/CostModel/SystemZ/logical.ll
+++ b/test/Analysis/CostModel/SystemZ/logical.ll
@ -0,0 +1,277 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; Cost-model expectations for 'and' on scalar i8/i16/i32/i64 and on vectors of
+; 2, 4, 8 and 16 elements of those types. Per the expectations below, every
+; type of at most 128 bits costs 1 and a wider vector costs (total bits / 128).
+; NOTE(review): presumably one operation per 128-bit vector register - confirm
+; against the SystemZ TTI implementation.
+define void @and() {
+  %res0 = and i8 undef, undef
+  %res1 = and i16 undef, undef
+  %res2 = and i32 undef, undef
+  %res3 = and i64 undef, undef
+  %res4 = and <2 x i8> undef, undef
+  %res5 = and <2 x i16> undef, undef
+  %res6 = and <2 x i32> undef, undef
+  %res7 = and <2 x i64> undef, undef
+  %res8 = and <4 x i8> undef, undef
+  %res9 = and <4 x i16> undef, undef
+  %res10 = and <4 x i32> undef, undef
+  %res11 = and <4 x i64> undef, undef
+  %res12 = and <8 x i8> undef, undef
+  %res13 = and <8 x i16> undef, undef
+  %res14 = and <8 x i32> undef, undef
+  %res15 = and <8 x i64> undef, undef
+  %res16 = and <16 x i8> undef, undef
+  %res17 = and <16 x i16> undef, undef
+  %res18 = and <16 x i32> undef, undef
+  %res19 = and <16 x i64> undef, undef
+
+; Expected costs, listed in the same order as the instructions above.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = and i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = and i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = and i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = and i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = and <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = and <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = and <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = and <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = and <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = and <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = and <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = and <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = and <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = and <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = and <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = and <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = and <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = and <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = and <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = and <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Cost-model expectations for 'ashr' on scalar i8/i16/i32/i64 and on vectors of
+; 2, 4, 8 and 16 elements of those types. Per the expectations below, scalar i8
+; and i16 cost 2 while scalar i32 and i64 cost 1; vectors of at most 128 bits
+; cost 1 and wider vectors cost (total bits / 128).
+; NOTE(review): the extra cost for i8/i16 presumably models extending the
+; narrow value before shifting - confirm against the SystemZ TTI implementation.
+define void @ashr() {
+  %res0 = ashr i8 undef, undef
+  %res1 = ashr i16 undef, undef
+  %res2 = ashr i32 undef, undef
+  %res3 = ashr i64 undef, undef
+  %res4 = ashr <2 x i8> undef, undef
+  %res5 = ashr <2 x i16> undef, undef
+  %res6 = ashr <2 x i32> undef, undef
+  %res7 = ashr <2 x i64> undef, undef
+  %res8 = ashr <4 x i8> undef, undef
+  %res9 = ashr <4 x i16> undef, undef
+  %res10 = ashr <4 x i32> undef, undef
+  %res11 = ashr <4 x i64> undef, undef
+  %res12 = ashr <8 x i8> undef, undef
+  %res13 = ashr <8 x i16> undef, undef
+  %res14 = ashr <8 x i32> undef, undef
+  %res15 = ashr <8 x i64> undef, undef
+  %res16 = ashr <16 x i8> undef, undef
+  %res17 = ashr <16 x i16> undef, undef
+  %res18 = ashr <16 x i32> undef, undef
+  %res19 = ashr <16 x i64> undef, undef
+
+; Expected costs, listed in the same order as the instructions above.
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res0 = ashr i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res1 = ashr i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = ashr i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = ashr i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = ashr <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = ashr <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = ashr <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = ashr <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = ashr <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = ashr <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = ashr <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = ashr <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = ashr <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = ashr <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = ashr <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = ashr <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = ashr <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = ashr <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = ashr <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = ashr <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Cost-model expectations for 'lshr' on scalar i8/i16/i32/i64 and on vectors of
+; 2, 4, 8 and 16 elements of those types. Per the expectations below, scalar i8
+; and i16 cost 2 while scalar i32 and i64 cost 1; vectors of at most 128 bits
+; cost 1 and wider vectors cost (total bits / 128).
+; NOTE(review): the extra cost for i8/i16 presumably models extending the
+; narrow value before shifting - confirm against the SystemZ TTI implementation.
+define void @lshr() {
+  %res0 = lshr i8 undef, undef
+  %res1 = lshr i16 undef, undef
+  %res2 = lshr i32 undef, undef
+  %res3 = lshr i64 undef, undef
+  %res4 = lshr <2 x i8> undef, undef
+  %res5 = lshr <2 x i16> undef, undef
+  %res6 = lshr <2 x i32> undef, undef
+  %res7 = lshr <2 x i64> undef, undef
+  %res8 = lshr <4 x i8> undef, undef
+  %res9 = lshr <4 x i16> undef, undef
+  %res10 = lshr <4 x i32> undef, undef
+  %res11 = lshr <4 x i64> undef, undef
+  %res12 = lshr <8 x i8> undef, undef
+  %res13 = lshr <8 x i16> undef, undef
+  %res14 = lshr <8 x i32> undef, undef
+  %res15 = lshr <8 x i64> undef, undef
+  %res16 = lshr <16 x i8> undef, undef
+  %res17 = lshr <16 x i16> undef, undef
+  %res18 = lshr <16 x i32> undef, undef
+  %res19 = lshr <16 x i64> undef, undef
+
+; Expected costs, listed in the same order as the instructions above.
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res0 = lshr i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res1 = lshr i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = lshr i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = lshr i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = lshr <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = lshr <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = lshr <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = lshr <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = lshr <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = lshr <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = lshr <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = lshr <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = lshr <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = lshr <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = lshr <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = lshr <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = lshr <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = lshr <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = lshr <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = lshr <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Cost-model expectations for 'or' on scalar i8/i16/i32/i64 and on vectors of
+; 2, 4, 8 and 16 elements of those types. Per the expectations below, every
+; type of at most 128 bits costs 1 and a wider vector costs (total bits / 128).
+; NOTE(review): presumably one operation per 128-bit vector register - confirm
+; against the SystemZ TTI implementation.
+define void @or() {
+  %res0 = or i8 undef, undef
+  %res1 = or i16 undef, undef
+  %res2 = or i32 undef, undef
+  %res3 = or i64 undef, undef
+  %res4 = or <2 x i8> undef, undef
+  %res5 = or <2 x i16> undef, undef
+  %res6 = or <2 x i32> undef, undef
+  %res7 = or <2 x i64> undef, undef
+  %res8 = or <4 x i8> undef, undef
+  %res9 = or <4 x i16> undef, undef
+  %res10 = or <4 x i32> undef, undef
+  %res11 = or <4 x i64> undef, undef
+  %res12 = or <8 x i8> undef, undef
+  %res13 = or <8 x i16> undef, undef
+  %res14 = or <8 x i32> undef, undef
+  %res15 = or <8 x i64> undef, undef
+  %res16 = or <16 x i8> undef, undef
+  %res17 = or <16 x i16> undef, undef
+  %res18 = or <16 x i32> undef, undef
+  %res19 = or <16 x i64> undef, undef
+  
+; Expected costs, listed in the same order as the instructions above.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = or i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = or i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = or i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = or i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = or <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = or <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = or <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = or <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = or <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = or <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = or <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = or <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = or <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = or <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = or <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = or <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = or <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = or <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = or <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = or <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Cost-model expectations for 'shl' on scalar i8/i16/i32/i64 and on vectors of
+; 2, 4, 8 and 16 elements of those types. Per the expectations below, all four
+; scalar widths cost 1 (i8 and i16 included), vectors of at most 128 bits cost
+; 1, and wider vectors cost (total bits / 128).
+define void @shl() {
+  %res0 = shl i8 undef, undef
+  %res1 = shl i16 undef, undef
+  %res2 = shl i32 undef, undef
+  %res3 = shl i64 undef, undef
+  %res4 = shl <2 x i8> undef, undef
+  %res5 = shl <2 x i16> undef, undef
+  %res6 = shl <2 x i32> undef, undef
+  %res7 = shl <2 x i64> undef, undef
+  %res8 = shl <4 x i8> undef, undef
+  %res9 = shl <4 x i16> undef, undef
+  %res10 = shl <4 x i32> undef, undef
+  %res11 = shl <4 x i64> undef, undef
+  %res12 = shl <8 x i8> undef, undef
+  %res13 = shl <8 x i16> undef, undef
+  %res14 = shl <8 x i32> undef, undef
+  %res15 = shl <8 x i64> undef, undef
+  %res16 = shl <16 x i8> undef, undef
+  %res17 = shl <16 x i16> undef, undef
+  %res18 = shl <16 x i32> undef, undef
+  %res19 = shl <16 x i64> undef, undef
+  
+; Expected costs, listed in the same order as the instructions above.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = shl i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = shl i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = shl i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = shl i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = shl <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = shl <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = shl <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = shl <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = shl <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = shl <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = shl <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = shl <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = shl <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = shl <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = shl <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = shl <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = shl <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = shl <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = shl <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = shl <16 x i64> undef, undef
+
+  ret void;
+}
+
+; Cost-model expectations for 'xor' on scalar i8/i16/i32/i64 and on vectors of
+; 2, 4, 8 and 16 elements of those types. Per the expectations below, every
+; type of at most 128 bits costs 1 and a wider vector costs (total bits / 128).
+; NOTE(review): presumably one operation per 128-bit vector register - confirm
+; against the SystemZ TTI implementation.
+define void @xor() {
+  %res0 = xor i8 undef, undef
+  %res1 = xor i16 undef, undef
+  %res2 = xor i32 undef, undef
+  %res3 = xor i64 undef, undef
+  %res4 = xor <2 x i8> undef, undef
+  %res5 = xor <2 x i16> undef, undef
+  %res6 = xor <2 x i32> undef, undef
+  %res7 = xor <2 x i64> undef, undef
+  %res8 = xor <4 x i8> undef, undef
+  %res9 = xor <4 x i16> undef, undef
+  %res10 = xor <4 x i32> undef, undef
+  %res11 = xor <4 x i64> undef, undef
+  %res12 = xor <8 x i8> undef, undef
+  %res13 = xor <8 x i16> undef, undef
+  %res14 = xor <8 x i32> undef, undef
+  %res15 = xor <8 x i64> undef, undef
+  %res16 = xor <16 x i8> undef, undef
+  %res17 = xor <16 x i16> undef, undef
+  %res18 = xor <16 x i32> undef, undef
+  %res19 = xor <16 x i64> undef, undef
+  
+; Expected costs, listed in the same order as the instructions above.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = xor i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = xor i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = xor i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = xor i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = xor <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = xor <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = xor <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = xor <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = xor <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = xor <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = xor <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = xor <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = xor <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = xor <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = xor <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = xor <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = xor <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = xor <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = xor <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = xor <16 x i64> undef, undef
+
+  ret void;
+}
--- a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
@ -0,0 +1,259 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Test that a load feeding an operation that can fold one memory operand gets
+; zero cost. When both operands are loaded, only one load can be folded, so the
+; other load should keep a non-zero cost.
+
+; 'add' on i32 and i64 with one or both operands coming from a load.
+; Expected: with a single loaded operand the load costs 0 and the add costs 1;
+; with two loaded operands the first load costs 0 and the second costs 1.
+; The unnamed add results appear as %1..%4 in the expectations below.
+define void @add() {
+  %li32 = load i32, i32* undef
+  add i32 %li32, undef
+
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  add i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  add i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  add i64 %li64_0, %li64_1
+
+  ret void;
+
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = add i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = add i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = add i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = add i64 %li64_0, %li64_1
+}
+
+; 'sub' on i32 and i64 with one or both operands coming from a load.
+; Expected: with a single loaded operand the load costs 0 and the sub costs 1;
+; with two loaded operands the first load costs 0 and the second costs 1.
+; The unnamed sub results appear as %1..%4 in the expectations below.
+define void @sub() {
+  %li32 = load i32, i32* undef
+  sub i32 %li32, undef
+
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  sub i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  sub i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  sub i64 %li64_0, %li64_1
+
+  ret void;
+
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = sub i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = sub i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sub i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sub i64 %li64_0, %li64_1
+}
+
+; 'mul' on i32 and i64 with one or both operands coming from a load.
+; Expected: with a single loaded operand the load costs 0 and the mul costs 1;
+; with two loaded operands the first load costs 0 and the second costs 1.
+; The unnamed mul results appear as %1..%4 in the expectations below.
+define void @mul() {
+  %li32 = load i32, i32* undef
+  mul i32 %li32, undef
+
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  mul i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  mul i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  mul i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = mul i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = mul i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = mul i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = mul i64 %li64_0, %li64_1
+}
+
+; 'sdiv' on i32 and i64 with one or both operands coming from a load.
+; Expected: a single loaded operand costs 0; with two loaded operands the first
+; load costs 0 and the second costs 1. The division itself is expected to cost
+; 2 for i32 but 1 for i64.
+; NOTE(review): the higher i32 cost presumably accounts for sign-extending the
+; dividend - confirm against the SystemZ TTI implementation.
+; The unnamed sdiv results appear as %1..%4 in the expectations below.
+define void @sdiv() {
+  %li32 = load i32, i32* undef
+  sdiv i32 %li32, undef
+
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  sdiv i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  sdiv i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  sdiv i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = sdiv i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %2 = sdiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sdiv i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sdiv i64 %li64_0, %li64_1
+}
+
+; 'udiv' on i32 and i64 with one or both operands coming from a load.
+; Expected: a single loaded operand costs 0; with two loaded operands the first
+; load costs 0 and the second costs 1. The division itself is expected to cost
+; 2 for both i32 and i64 (unlike 'sdiv', where i64 is expected to cost 1).
+; The unnamed udiv results appear as %1..%4 in the expectations below.
+define void @udiv() {
+  %li32 = load i32, i32* undef
+  udiv i32 %li32, undef
+
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  udiv i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  udiv i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  udiv i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = udiv i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %2 = udiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %3 = udiv i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %4 = udiv i64 %li64_0, %li64_1
+}
+
+; 'and' on i32 and i64 with one or both operands coming from a load.
+; Expected: with a single loaded operand the load costs 0 and the and costs 1;
+; with two loaded operands the first load costs 0 and the second costs 1.
+; The unnamed and results appear as %1..%4 in the expectations below.
+define void @and() {
+  %li32 = load i32, i32* undef
+  and i32 %li32, undef
+
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  and i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  and i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  and i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = and i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = and i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = and i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = and i64 %li64_0, %li64_1
+}
+
+; 'or' on i32 and i64 with one or both operands coming from a load.
+; Expected: with a single loaded operand the load costs 0 and the or costs 1;
+; with two loaded operands the first load costs 0 and the second costs 1.
+; The unnamed or results appear as %1..%4 in the expectations below.
+define void @or() {
+  %li32 = load i32, i32* undef
+  or i32 %li32, undef
+
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  or i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  or i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  or i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = or i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = or i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = or i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = or i64 %li64_0, %li64_1
+}
+
+; 'xor' on i32 and i64 with one or both operands coming from a load.
+; Expected: with a single loaded operand the load costs 0 and the xor costs 1;
+; with two loaded operands the first load costs 0 and the second costs 1.
+; The unnamed xor results appear as %1..%4 in the expectations below.
+define void @xor() {
+  %li32 = load i32, i32* undef
+  xor i32 %li32, undef
+
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  xor i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  xor i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  xor i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = xor i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = xor i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = xor i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = xor i64 %li64_0, %li64_1
+}
+
+; 'icmp eq' on i32 and i64 with one or both operands coming from a load.
+; Expected: with a single loaded operand the load costs 0 and the compare costs
+; 1; with two loaded operands the first load costs 0 and the second costs 1.
+; The unnamed compare results appear as %1..%4 in the expectations below.
+define void @icmp() {
+  %li32 = load i32, i32* undef
+  icmp eq i32 %li32, undef
+
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  icmp eq i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  icmp eq i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  icmp eq i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = icmp eq i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = icmp eq i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = icmp eq i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = icmp eq i64 %li64_0, %li64_1
+}
--- a/test/Analysis/CostModel/SystemZ/scalar-cmp-cmp-log-sel.ll
+++ b/test/Analysis/CostModel/SystemZ/scalar-cmp-cmp-log-sel.ll
--- a/test/Analysis/CostModel/SystemZ/shuffle.ll
+++ b/test/Analysis/CostModel/SystemZ/shuffle.ll
@ -0,0 +1,112 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; CHECK: shuffle
+; Cost-model expectations for shufflevector, grouped by the shape of the
+; shuffle mask (reverse, alternate, broadcast, arbitrary). All operands are
+; undef; only the reported cost of each instruction is checked. The expected
+; costs are listed after the `ret`, in instruction order (%1..%40).
+define void @shuffle() {
+
+  ;; Reverse shuffles: the mask selects the lanes of the first operand in
+  ;; descending order. Every case below is expected to cost 1.
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+
+  ; NOTE(review): the next two shuffles are identical to the i32 pair directly
+  ; above. Possibly float element types were intended here -- confirm before
+  ; changing, since the matching expectations (%7, %8) would need updating too.
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+
+  ;; Alternate shuffles: result lane i comes from lane i of one of the two
+  ;; operands, alternating between them. Each type is tested starting with
+  ;; either operand. Every case below is expected to cost 1.
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 0, i32 3>
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 2, i32 1>
+
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 0, i32 3>
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 2, i32 1>
+
+  ;; Broadcast shuffles: an all-zero mask replicates lane 0 into every result
+  ;; lane. Each pair tests a 128-bit type (expected cost 0) followed by the
+  ;; 256-bit type with the same element type (expected cost 1).
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+  shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
+
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+  shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+  shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+  shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+  shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+
+  ;; Random shuffles: arbitrary masks mixing lanes of both operands. Each pair
+  ;; tests a 128-bit type (expected cost 1) followed by a wider type with the
+  ;; same element type (expected cost 2).
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 4, i32 17, i32 2, i32 19, i32 0, i32 21, i32 8, i32 23, i32 6, i32 10, i32 10, i32 27, i32 29, i32 29, i32 14, i32 31>
+  shufflevector <18 x i8> undef, <18 x i8> undef, <18 x i32> <i32 4, i32 17, i32 2, i32 19, i32 0, i32 21, i32 8, i32 23, i32 6, i32 10, i32 10, i32 27, i32 29, i32 29, i32 14, i32 31, i32 0, i32 1>
+
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 9, i32 9, i32 2, i32 2, i32 4, i32 13, i32 15, i32 15>
+  shufflevector <12 x i16> undef, <12 x i16> undef, <12 x i32> <i32 9, i32 9, i32 2, i32 2, i32 4, i32 13, i32 15, i32 15, i32 9, i32 2, i32 2, i32 4>
+
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 4, i32 7>
+  shufflevector <6 x i32> undef, <6 x i32> undef, <6 x i32> <i32 0, i32 0, i32 4, i32 7, i32 4, i32 7>
+
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 2>
+  shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 2>
+
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 2, i32 1>
+  shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 2>
+
+  ret void
+
+; Expected costs: %1-%10 reverse, %11-%20 alternate, %21-%30 broadcast,
+; %31-%40 random.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %11 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %12 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %14 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %16 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %17 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 0, i32 3>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %18 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 2, i32 1>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %19 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 0, i32 3>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %20 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 2, i32 1>
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %21 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %22 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %23 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %24 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %25 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %26 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %27 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %28 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %29 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %30 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %31 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 4, i32 17, i32 2, i32 19, i32 0, i32 21, i32 8, i32 23, i32 6, i32 10, i32 10, i32 27, i32 29, i32 29, i32 14, i32 31>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %32 = shufflevector <18 x i8> undef, <18 x i8> undef, <18 x i32> <i32 4, i32 17, i32 2, i32 19, i32 0, i32 21, i32 8, i32 23, i32 6, i32 10, i32 10, i32 27, i32 29, i32 29, i32 14, i32 31, i32 0, i32 1>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %33 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 9, i32 9, i32 2, i32 2, i32 4, i32 13, i32 15, i32 15>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %34 = shufflevector <12 x i16> undef, <12 x i16> undef, <12 x i32> <i32 9, i32 9, i32 2, i32 2, i32 4, i32 13, i32 15, i32 15, i32 9, i32 2, i32 2, i32 4>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %35 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 4, i32 7>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %36 = shufflevector <6 x i32> undef, <6 x i32> undef, <6 x i32> <i32 0, i32 0, i32 4, i32 7, i32 4, i32 7>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %37 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 2>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %38 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 2>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %39 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 2, i32 1>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %40 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 2>
+}
--- a/test/Analysis/CostModel/SystemZ/vectorinstrs.ll
+++ b/test/Analysis/CostModel/SystemZ/vectorinstrs.ll
@ -0,0 +1,56 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; CHECK: vecinstrs
+; Cost-model expectations for extractelement and insertelement on 128-bit
+; vector types and on vectors of i1. All operands are undef; only the reported
+; cost of each instruction is checked. Expected costs are listed after the
+; `ret`, in instruction order (%1..%18).
+define void @vecinstrs() {
+
+  ;; Extract element: for integer element types, index 0 is penalized somewhat
+  ;; with an expected cost of 2, while index 1 is expected to cost 1. For
+  ;; <2 x double> both index 0 and index 1 are expected to cost 1.
+  extractelement <16 x i8> undef, i32 0
+  extractelement <16 x i8> undef, i32 1
+
+  extractelement <8 x i16> undef, i32 0
+  extractelement <8 x i16> undef, i32 1
+
+  extractelement <4 x i32> undef, i32 0
+  extractelement <4 x i32> undef, i32 1
+
+  extractelement <2 x i64> undef, i32 0
+  extractelement <2 x i64> undef, i32 1
+
+  extractelement <2 x double> undef, i32 0
+  extractelement <2 x double> undef, i32 1
+
+  ; Extraction of i1 means extract + test under mask before branch, so the
+  ; expected cost is one higher than for the other integer types: 3 at index 0
+  ; and 2 at the nonzero indices tested here.
+  extractelement <2 x i1> undef, i32 0
+  extractelement <4 x i1> undef, i32 1
+  extractelement <8 x i1> undef, i32 2
+
+  ;; Insert element: expected cost 1 for the i8, i16 and i32 cases.
+  insertelement <16 x i8> undef, i8 undef, i32 0
+  insertelement <8 x i16> undef, i16 undef, i32 0
+  insertelement <4 x i32> undef, i32 undef, i32 0
+
+  ; vlvgp will do two grs into a vector register: only add cost half of the
+  ; time. Here index 0 is expected to cost 1 and index 1 to cost 0.
+  insertelement <2 x i64> undef, i64 undef, i32 0
+  insertelement <2 x i64> undef, i64 undef, i32 1
+
+  ret void
+
+; Expected costs: %1-%13 are the extracts, %14-%18 the inserts.
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = extractelement <16 x i8> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = extractelement <16 x i8> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %3 = extractelement <8 x i16> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = extractelement <8 x i16> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %5 = extractelement <4 x i32> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = extractelement <4 x i32> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %7 = extractelement <2 x i64> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = extractelement <2 x i64> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = extractelement <2 x double> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = extractelement <2 x double> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %11 = extractelement <2 x i1> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %12 = extractelement <4 x i1> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %13 = extractelement <8 x i1> undef, i32 2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %14 = insertelement <16 x i8> undef, i8 undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %15 = insertelement <8 x i16> undef, i16 undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %16 = insertelement <4 x i32> undef, i32 undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %17 = insertelement <2 x i64> undef, i64 undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %18 = insertelement <2 x i64> undef, i64 undef, i32 1
+}
--- a/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
+++ b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
@ -0,0 +1,70 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that the loop vectorizer performs memory interleaving with accurate
+; cost estimations.
+
+
+; Simple case where just the load is interleaved, because the store group
+; would have gaps.
+;
+; The loop walks %data with a stride of two i32 elements: each iteration loads
+; data[i], adds 1, and stores the result back to the same address, so the
+; elements in between are never accessed.
+define void @fun0(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = add i32 %tmp1, 1
+  store i32 %tmp2, i32* %tmp0, align 4
+  ; Induction variable advances by 2, giving the strided access pattern.
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; The vectorizer's debug output is expected to report an interleave group for
+; the load, and a cost of 3 for that load at VF 4.
+; CHECK: LV: Creating an interleave group with:  %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+;        Instructions accounting for the cost of 3: (vl; vl; vperm)
+}
+
+; Interleaving of both load and stores.
+;
+; The loop walks %data with a stride of two i32 elements and swaps each pair:
+; it loads data[i] and data[i+1], then stores the first value to data[i+1] and
+; the second to data[i]. Both loads and both stores are therefore expected to
+; end up in interleave groups.
+define void @fun1(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %i_1  = add i64 %i, 1
+  %tmp2 = getelementptr inbounds i32, i32* %data, i64 %i_1
+  %tmp3 = load i32, i32* %tmp2, align 4
+  ; Store the two loaded values back in swapped positions.
+  store i32 %tmp1, i32* %tmp2, align 4
+  store i32 %tmp3, i32* %tmp0, align 4
+  ; Induction variable advances by 2, giving the strided access pattern.
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; Expected group formation: one group holding both stores and one holding
+; both loads.
+; CHECK: LV: Creating an interleave group with:  store i32 %tmp3, i32* %tmp0, align 4
+; CHECK: LV: Inserted:  store i32 %tmp1, i32* %tmp2, align 4
+; CHECK:     into the interleave group with  store i32 %tmp3, i32* %tmp0, align 4
+; CHECK: LV: Creating an interleave group with:  %tmp3 = load i32, i32* %tmp2, align 4
+; CHECK: LV: Inserted:  %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK:     into the interleave group with  %tmp3 = load i32, i32* %tmp2, align 4
+
+; Expected costs at VF 4: within each group one member is reported with a
+; cost of 4 and the other with a cost of 0.
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %tmp3 = load i32, i32* %tmp2, align 4
+;            Instructions accounting for the load cost of 4: (vl; vl; vperm; vpkg)
+
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   store i32 %tmp1, i32* %tmp2, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp3, i32* %tmp0, align 4
+;            Instructions accounting for the store cost of 4: (vmrlf; vmrhf; vst; vst)
+}
+