Mirror of https://github.com/RPCS3/llvm-mirror.git (synced 2024-11-22 10:42:39 +01:00)
[InstCombine] Move target-specific inst combining
For a long time, the InstCombine pass handled target-specific intrinsics, and having target-specific code in generic passes had long been noted as an area for improvement. D81728 moves most of that target-specific code out of the InstCombine pass.

Applying the target-specific combines in an extra pass would likely result in inferior optimizations compared to the current fixed-point iteration, so the InstCombine pass instead calls newly introduced TargetTransformInfo hooks when it encounters intrinsics it does not know. The patch should not have any effect on generated code (under the assumption that code never uses intrinsics from a foreign target).

This introduces three new functions:
  TargetTransformInfo::instCombineIntrinsic
  TargetTransformInfo::simplifyDemandedUseBitsIntrinsic
  TargetTransformInfo::simplifyDemandedVectorEltsIntrinsic

A few target-specific parts are left in the InstCombine folder, where it makes sense to share code. The largest left-over part, in InstCombineCalls.cpp, is the code shared between ARM and AArch64. This allows moving about 3000 lines out of InstCombine and into the targets.

Differential Revision: https://reviews.llvm.org/D81728
Commit 8980610845 (parent df534fb7df)
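How the new hooks are meant to be consumed, sketched from the description above and from the declarations in this diff (the surrounding control flow is illustrative, not the verbatim InstCombine code): when the combiner reaches an intrinsic it has no generic handling for, it asks TTI and interprets the optional result as follows.

// Illustrative sketch only. IC is the running InstCombiner, TTI the target's
// TargetTransformInfo, II the unknown IntrinsicInst being visited.
if (Optional<Instruction *> Result = TTI.instCombineIntrinsic(IC, II)) {
  // The target handled the intrinsic: a non-null Instruction * is handed back
  // to the combiner (typically produced via IC.replaceInstUsesWith or
  // IC.replaceOperand), while nullptr means "stop further processing of this
  // intrinsic".
  return *Result;
}
// None: fall through to the generic, target-independent handling.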

include/llvm/Analysis/TargetTransformInfo.h
@@ -42,6 +42,7 @@ class CallBase;
class ExtractElementInst;
class Function;
class GlobalValue;
class InstCombiner;
class IntrinsicInst;
class LoadInst;
class LoopAccessInfo;
@@ -56,6 +57,7 @@ class TargetLibraryInfo;
class Type;
class User;
class Value;
struct KnownBits;
template <typename T> class Optional;

/// Information about a load/store intrinsic defined by the target.
@@ -542,6 +544,29 @@ public:
  /// target-independent defaults with information from \p L and \p SE.
  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             PeelingPreferences &PP) const;

  /// Targets can implement their own combinations for target-specific
  /// intrinsics. This function will be called from the InstCombine pass every
  /// time a target-specific intrinsic is encountered.
  ///
  /// \returns None to not do anything target specific, or a value that will be
  /// returned from the InstCombiner. It is possible to stop further processing
  /// of the intrinsic by returning nullptr.
  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) const;
  /// Can be used to implement target-specific instruction combining.
  /// \see instCombineIntrinsic
  Optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) const;
  /// Can be used to implement target-specific instruction combining.
  /// \see instCombineIntrinsic
  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;
  /// @}

  /// \name Scalar Target Information
@@ -1301,6 +1326,17 @@ public:
                                     AssumptionCache &AC, TargetLibraryInfo *TLI,
                                     DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
  virtual bool emitGetActiveLaneMask() = 0;
  virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                       IntrinsicInst &II) = 0;
  virtual Optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) = 0;
  virtual Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) = 0;
  virtual bool isLegalAddImmediate(int64_t Imm) = 0;
  virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
  virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1588,6 +1624,26 @@ public:
  bool emitGetActiveLaneMask() override {
    return Impl.emitGetActiveLaneMask();
  }
  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) override {
    return Impl.instCombineIntrinsic(IC, II);
  }
  Optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) override {
    return Impl.simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                 KnownBitsComputed);
  }
  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) override {
    return Impl.simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }
  bool isLegalAddImmediate(int64_t Imm) override {
    return Impl.isLegalAddImmediate(Imm);
  }

include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -147,6 +147,26 @@ public:
    return false;
  }

  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) const {
    return None;
  }

  Optional<Value *>
  simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
                                   APInt DemandedMask, KnownBits &Known,
                                   bool &KnownBitsComputed) const {
    return None;
  }

  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const {
    return None;
  }

  void getUnrollingPreferences(Loop *, ScalarEvolution &,
                               TTI::UnrollingPreferences &) {}

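The defaults above return None, so targets that do not override the hooks keep today's behaviour. A minimal sketch of what a target-side override could look like; MyTargetTTIImpl and the folded intrinsic are hypothetical, only the hook signature and the InstCombiner helpers come from this patch.

// Hypothetical target override: fold a pass-through intrinsic and defer to
// the default (None) for everything else.
Optional<Instruction *>
MyTargetTTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                      IntrinsicInst &II) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::mytarget_passthrough: // hypothetical intrinsic
    // The intrinsic returns its first operand unchanged, so all uses can be
    // replaced with that operand.
    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  default:
    break;
  }
  return None; // No target-specific combine applies.
}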

include/llvm/CodeGen/BasicTTIImpl.h
@@ -477,6 +477,30 @@ public:
    return BaseT::emitGetActiveLaneMask();
  }

  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedMask,
                                                     KnownBits &Known,
                                                     bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  int getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;
@@ -1605,7 +1629,7 @@ public:
      }
    }

-    auto MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
+    auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
    if (MinLegalCostI != LegalCost.end())
      return *MinLegalCostI;


include/llvm/IR/Function.h
@@ -199,6 +199,11 @@ public:
  /// returns Intrinsic::not_intrinsic!
  bool isIntrinsic() const { return HasLLVMReservedName; }

  /// isTargetIntrinsic - Returns true if this function is an intrinsic and the
  /// intrinsic is specific to a certain target. If this is not an intrinsic
  /// or a generic intrinsic, false is returned.
  bool isTargetIntrinsic() const;

  /// Returns true if the function is one of the "Constrained Floating-Point
  /// Intrinsics". Returns false if not, and returns false when
  /// getIntrinsicID() returns Intrinsic::not_intrinsic.
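Function::isTargetIntrinsic() is what lets the combiner skip the TTI round trip for generic intrinsics. A sketch of how the InstCombiner-side wrapper (declared as targetInstCombineIntrinsic in the new InstCombiner.h below) presumably gates on it; illustrative rather than the exact code in InstructionCombining.cpp.

Optional<Instruction *>
InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) {
  // Only target-specific intrinsics are forwarded to the target hook; generic
  // intrinsics keep their existing, target-independent handling.
  if (II.getCalledFunction()->isTargetIntrinsic())
    return TTI.instCombineIntrinsic(*this, II);
  return None;
}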

include/llvm/Transforms/InstCombine/InstCombiner.h (new file, 518 lines)
@@ -0,0 +1,518 @@
//===- InstCombiner.h - InstCombine implementation --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file provides the interface for the instcombine pass implementation.
/// The interface is used for generic transformations in this folder and
/// target specific combinations in the targets.
/// The visitor implementation is in \c InstCombinerImpl in
/// \c InstCombineInternal.h.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINER_H
#define LLVM_TRANSFORMS_INSTCOMBINE_INSTCOMBINER_H

#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include <cassert>

#define DEBUG_TYPE "instcombine"

namespace llvm {

class AAResults;
class AssumptionCache;
class ProfileSummaryInfo;
class TargetLibraryInfo;
class TargetTransformInfo;
struct KnownBits;

/// The core instruction combiner logic.
///
/// This class provides both the logic to recursively visit instructions and
/// combine them.
class LLVM_LIBRARY_VISIBILITY InstCombiner {
  /// Only used to call target specific inst combining.
  TargetTransformInfo &TTI;

public:
  /// Maximum size of array considered when transforming.
  uint64_t MaxArraySizeForCombine = 0;

  /// An IRBuilder that automatically inserts new instructions into the
  /// worklist.
  using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>;
  BuilderTy &Builder;

protected:
  /// A worklist of the instructions that need to be simplified.
  InstCombineWorklist &Worklist;

  // Mode in which we are running the combiner.
  const bool MinimizeSize;

  AAResults *AA;

  // Required analyses.
  AssumptionCache &AC;
  TargetLibraryInfo &TLI;
  DominatorTree &DT;
  const DataLayout &DL;
  const SimplifyQuery SQ;
  OptimizationRemarkEmitter &ORE;
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Optional analyses. When non-null, these can both be used to do better
  // combining and will be updated to reflect any changes.
  LoopInfo *LI;

  bool MadeIRChange = false;

public:
  InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder,
               bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
               TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
               DominatorTree &DT, OptimizationRemarkEmitter &ORE,
               BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
               const DataLayout &DL, LoopInfo *LI)
      : TTI(TTI), Builder(Builder), Worklist(Worklist),
        MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL),
        SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {}

  virtual ~InstCombiner() {}

/// Return the source operand of a potentially bitcasted value while
|
||||
/// optionally checking if it has one use. If there is no bitcast or the one
|
||||
/// use check is not met, return the input value itself.
|
||||
static Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) {
|
||||
if (auto *BitCast = dyn_cast<BitCastInst>(V))
|
||||
if (!OneUseOnly || BitCast->hasOneUse())
|
||||
return BitCast->getOperand(0);
|
||||
|
||||
// V is not a bitcast or V has more than one use and OneUseOnly is true.
|
||||
return V;
|
||||
}
|
||||
|
||||
/// Assign a complexity or rank value to LLVM Values. This is used to reduce
|
||||
/// the amount of pattern matching needed for compares and commutative
|
||||
/// instructions. For example, if we have:
|
||||
/// icmp ugt X, Constant
|
||||
/// or
|
||||
/// xor (add X, Constant), cast Z
|
||||
///
|
||||
/// We do not have to consider the commuted variants of these patterns because
|
||||
/// canonicalization based on complexity guarantees the above ordering.
|
||||
///
|
||||
/// This routine maps IR values to various complexity ranks:
|
||||
/// 0 -> undef
|
||||
/// 1 -> Constants
|
||||
/// 2 -> Other non-instructions
|
||||
/// 3 -> Arguments
|
||||
/// 4 -> Cast and (f)neg/not instructions
|
||||
/// 5 -> Other instructions
|
||||
static unsigned getComplexity(Value *V) {
|
||||
if (isa<Instruction>(V)) {
|
||||
if (isa<CastInst>(V) || match(V, m_Neg(PatternMatch::m_Value())) ||
|
||||
match(V, m_Not(PatternMatch::m_Value())) ||
|
||||
match(V, m_FNeg(PatternMatch::m_Value())))
|
||||
return 4;
|
||||
return 5;
|
||||
}
|
||||
if (isa<Argument>(V))
|
||||
return 3;
|
||||
return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
|
||||
}
|
||||
|
||||
/// Predicate canonicalization reduces the number of patterns that need to be
|
||||
/// matched by other transforms. For example, we may swap the operands of a
|
||||
/// conditional branch or select to create a compare with a canonical
|
||||
/// (inverted) predicate which is then more likely to be matched with other
|
||||
/// values.
|
||||
static bool isCanonicalPredicate(CmpInst::Predicate Pred) {
|
||||
switch (Pred) {
|
||||
case CmpInst::ICMP_NE:
|
||||
case CmpInst::ICMP_ULE:
|
||||
case CmpInst::ICMP_SLE:
|
||||
case CmpInst::ICMP_UGE:
|
||||
case CmpInst::ICMP_SGE:
|
||||
// TODO: There are 16 FCMP predicates. Should others be (not) canonical?
|
||||
case CmpInst::FCMP_ONE:
|
||||
case CmpInst::FCMP_OLE:
|
||||
case CmpInst::FCMP_OGE:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/// Given an exploded icmp instruction, return true if the comparison only
|
||||
/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if
|
||||
/// the result of the comparison is true when the input value is signed.
|
||||
static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS,
|
||||
bool &TrueIfSigned) {
|
||||
switch (Pred) {
|
||||
case ICmpInst::ICMP_SLT: // True if LHS s< 0
|
||||
TrueIfSigned = true;
|
||||
return RHS.isNullValue();
|
||||
case ICmpInst::ICMP_SLE: // True if LHS s<= -1
|
||||
TrueIfSigned = true;
|
||||
return RHS.isAllOnesValue();
|
||||
case ICmpInst::ICMP_SGT: // True if LHS s> -1
|
||||
TrueIfSigned = false;
|
||||
return RHS.isAllOnesValue();
|
||||
case ICmpInst::ICMP_SGE: // True if LHS s>= 0
|
||||
TrueIfSigned = false;
|
||||
return RHS.isNullValue();
|
||||
case ICmpInst::ICMP_UGT:
|
||||
// True if LHS u> RHS and RHS == sign-bit-mask - 1
|
||||
TrueIfSigned = true;
|
||||
return RHS.isMaxSignedValue();
|
||||
case ICmpInst::ICMP_UGE:
|
||||
// True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc)
|
||||
TrueIfSigned = true;
|
||||
return RHS.isMinSignedValue();
|
||||
case ICmpInst::ICMP_ULT:
|
||||
// True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc)
|
||||
TrueIfSigned = false;
|
||||
return RHS.isMinSignedValue();
|
||||
case ICmpInst::ICMP_ULE:
|
||||
// True if LHS u<= RHS and RHS == sign-bit-mask - 1
|
||||
TrueIfSigned = false;
|
||||
return RHS.isMaxSignedValue();
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// Add one to a Constant
|
||||
static Constant *AddOne(Constant *C) {
|
||||
return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
|
||||
}
|
||||
|
||||
/// Subtract one from a Constant
|
||||
static Constant *SubOne(Constant *C) {
|
||||
return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
|
||||
}
|
||||
|
||||
llvm::Optional<std::pair<
|
||||
CmpInst::Predicate,
|
||||
Constant *>> static getFlippedStrictnessPredicateAndConstant(CmpInst::
|
||||
Predicate
|
||||
Pred,
|
||||
Constant *C);
|
||||
|
||||
/// Return true if the specified value is free to invert (apply ~ to).
|
||||
/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses
|
||||
/// is true, work under the assumption that the caller intends to remove all
|
||||
/// uses of V and only keep uses of ~V.
|
||||
///
|
||||
/// See also: canFreelyInvertAllUsersOf()
|
||||
static bool isFreeToInvert(Value *V, bool WillInvertAllUses) {
|
||||
// ~(~(X)) -> X.
|
||||
if (match(V, m_Not(PatternMatch::m_Value())))
|
||||
return true;
|
||||
|
||||
// Constants can be considered to be not'ed values.
|
||||
if (match(V, PatternMatch::m_AnyIntegralConstant()))
|
||||
return true;
|
||||
|
||||
// Compares can be inverted if all of their uses are being modified to use
|
||||
// the ~V.
|
||||
if (isa<CmpInst>(V))
|
||||
return WillInvertAllUses;
|
||||
|
||||
// If `V` is of the form `A + Constant` then `-1 - V` can be folded into
|
||||
// `(-1 - Constant) - A` if we are willing to invert all of the uses.
|
||||
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
|
||||
if (BO->getOpcode() == Instruction::Add ||
|
||||
BO->getOpcode() == Instruction::Sub)
|
||||
if (isa<Constant>(BO->getOperand(0)) ||
|
||||
isa<Constant>(BO->getOperand(1)))
|
||||
return WillInvertAllUses;
|
||||
|
||||
// Selects with invertible operands are freely invertible
|
||||
if (match(V,
|
||||
m_Select(PatternMatch::m_Value(), m_Not(PatternMatch::m_Value()),
|
||||
m_Not(PatternMatch::m_Value()))))
|
||||
return WillInvertAllUses;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Given i1 V, can every user of V be freely adapted if V is changed to !V ?
|
||||
/// InstCombine's canonicalizeICmpPredicate() must be kept in sync with this
|
||||
/// fn.
|
||||
///
|
||||
/// See also: isFreeToInvert()
|
||||
static bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) {
|
||||
// Look at every user of V.
|
||||
for (Use &U : V->uses()) {
|
||||
if (U.getUser() == IgnoredUser)
|
||||
continue; // Don't consider this user.
|
||||
|
||||
auto *I = cast<Instruction>(U.getUser());
|
||||
switch (I->getOpcode()) {
|
||||
case Instruction::Select:
|
||||
if (U.getOperandNo() != 0) // Only if the value is used as select cond.
|
||||
return false;
|
||||
break;
|
||||
case Instruction::Br:
|
||||
assert(U.getOperandNo() == 0 && "Must be branching on that value.");
|
||||
break; // Free to invert by swapping true/false values/destinations.
|
||||
case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring
|
||||
// it.
|
||||
if (!match(I, m_Not(PatternMatch::m_Value())))
|
||||
return false; // Not a 'not'.
|
||||
break;
|
||||
default:
|
||||
return false; // Don't know, likely not freely invertible.
|
||||
}
|
||||
// So far all users were free to invert...
|
||||
}
|
||||
return true; // Can freely invert all users!
|
||||
}
|
||||
|
||||
/// Some binary operators require special handling to avoid poison and
|
||||
/// undefined behavior. If a constant vector has undef elements, replace those
|
||||
/// undefs with identity constants if possible because those are always safe
|
||||
/// to execute. If no identity constant exists, replace undef with some other
|
||||
/// safe constant.
|
||||
static Constant *
|
||||
getSafeVectorConstantForBinop(BinaryOperator::BinaryOps Opcode, Constant *In,
|
||||
bool IsRHSConstant) {
|
||||
auto *InVTy = dyn_cast<VectorType>(In->getType());
|
||||
assert(InVTy && "Not expecting scalars here");
|
||||
|
||||
Type *EltTy = InVTy->getElementType();
|
||||
auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant);
|
||||
if (!SafeC) {
|
||||
// TODO: Should this be available as a constant utility function? It is
|
||||
// similar to getBinOpAbsorber().
|
||||
if (IsRHSConstant) {
|
||||
switch (Opcode) {
|
||||
case Instruction::SRem: // X % 1 = 0
|
||||
case Instruction::URem: // X %u 1 = 0
|
||||
SafeC = ConstantInt::get(EltTy, 1);
|
||||
break;
|
||||
case Instruction::FRem: // X % 1.0 (doesn't simplify, but it is safe)
|
||||
SafeC = ConstantFP::get(EltTy, 1.0);
|
||||
break;
|
||||
default:
|
||||
llvm_unreachable(
|
||||
"Only rem opcodes have no identity constant for RHS");
|
||||
}
|
||||
} else {
|
||||
switch (Opcode) {
|
||||
case Instruction::Shl: // 0 << X = 0
|
||||
case Instruction::LShr: // 0 >>u X = 0
|
||||
case Instruction::AShr: // 0 >> X = 0
|
||||
case Instruction::SDiv: // 0 / X = 0
|
||||
case Instruction::UDiv: // 0 /u X = 0
|
||||
case Instruction::SRem: // 0 % X = 0
|
||||
case Instruction::URem: // 0 %u X = 0
|
||||
case Instruction::Sub: // 0 - X (doesn't simplify, but it is safe)
|
||||
case Instruction::FSub: // 0.0 - X (doesn't simplify, but it is safe)
|
||||
case Instruction::FDiv: // 0.0 / X (doesn't simplify, but it is safe)
|
||||
case Instruction::FRem: // 0.0 % X = 0
|
||||
SafeC = Constant::getNullValue(EltTy);
|
||||
break;
|
||||
default:
|
||||
llvm_unreachable("Expected to find identity constant for opcode");
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(SafeC && "Must have safe constant for binop");
|
||||
unsigned NumElts = InVTy->getNumElements();
|
||||
SmallVector<Constant *, 16> Out(NumElts);
|
||||
for (unsigned i = 0; i != NumElts; ++i) {
|
||||
Constant *C = In->getAggregateElement(i);
|
||||
Out[i] = isa<UndefValue>(C) ? SafeC : C;
|
||||
}
|
||||
return ConstantVector::get(Out);
|
||||
}
|
||||
|
||||
/// Create and insert the idiom we use to indicate a block is unreachable
|
||||
/// without having to rewrite the CFG from within InstCombine.
|
||||
static void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
|
||||
auto &Ctx = InsertAt->getContext();
|
||||
new StoreInst(ConstantInt::getTrue(Ctx),
|
||||
UndefValue::get(Type::getInt1PtrTy(Ctx)), InsertAt);
|
||||
}
|
||||
|
||||
void addToWorklist(Instruction *I) { Worklist.push(I); }
|
||||
|
||||
AssumptionCache &getAssumptionCache() const { return AC; }
|
||||
TargetLibraryInfo &getTargetLibraryInfo() const { return TLI; }
|
||||
DominatorTree &getDominatorTree() const { return DT; }
|
||||
const DataLayout &getDataLayout() const { return DL; }
|
||||
const SimplifyQuery &getSimplifyQuery() const { return SQ; }
|
||||
OptimizationRemarkEmitter &getOptimizationRemarkEmitter() const {
|
||||
return ORE;
|
||||
}
|
||||
BlockFrequencyInfo *getBlockFrequencyInfo() const { return BFI; }
|
||||
ProfileSummaryInfo *getProfileSummaryInfo() const { return PSI; }
|
||||
LoopInfo *getLoopInfo() const { return LI; }
|
||||
|
||||
// Call target specific combiners
|
||||
Optional<Instruction *> targetInstCombineIntrinsic(IntrinsicInst &II);
|
||||
Optional<Value *>
|
||||
targetSimplifyDemandedUseBitsIntrinsic(IntrinsicInst &II, APInt DemandedMask,
|
||||
KnownBits &Known,
|
||||
bool &KnownBitsComputed);
|
||||
Optional<Value *> targetSimplifyDemandedVectorEltsIntrinsic(
|
||||
IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
|
||||
APInt &UndefElts2, APInt &UndefElts3,
|
||||
std::function<void(Instruction *, unsigned, APInt, APInt &)>
|
||||
SimplifyAndSetOp);
|
||||
|
||||
/// Inserts an instruction \p New before instruction \p Old
|
||||
///
|
||||
/// Also adds the new instruction to the worklist and returns \p New so that
|
||||
/// it is suitable for use as the return from the visitation patterns.
|
||||
Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
|
||||
assert(New && !New->getParent() &&
|
||||
"New instruction already inserted into a basic block!");
|
||||
BasicBlock *BB = Old.getParent();
|
||||
BB->getInstList().insert(Old.getIterator(), New); // Insert inst
|
||||
Worklist.push(New);
|
||||
return New;
|
||||
}
|
||||
|
||||
/// Same as InsertNewInstBefore, but also sets the debug loc.
|
||||
Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
|
||||
New->setDebugLoc(Old.getDebugLoc());
|
||||
return InsertNewInstBefore(New, Old);
|
||||
}
|
||||
|
||||
/// A combiner-aware RAUW-like routine.
|
||||
///
|
||||
/// This method is to be used when an instruction is found to be dead,
|
||||
/// replaceable with another preexisting expression. Here we add all uses of
|
||||
/// I to the worklist, replace all uses of I with the new value, then return
|
||||
/// I, so that the inst combiner will know that I was modified.
|
||||
Instruction *replaceInstUsesWith(Instruction &I, Value *V) {
|
||||
// If there are no uses to replace, then we return nullptr to indicate that
|
||||
// no changes were made to the program.
|
||||
if (I.use_empty())
|
||||
return nullptr;
|
||||
|
||||
Worklist.pushUsersToWorkList(I); // Add all modified instrs to worklist.
|
||||
|
||||
// If we are replacing the instruction with itself, this must be in a
|
||||
// segment of unreachable code, so just clobber the instruction.
|
||||
if (&I == V)
|
||||
V = UndefValue::get(I.getType());
|
||||
|
||||
LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n"
|
||||
<< " with " << *V << '\n');
|
||||
|
||||
I.replaceAllUsesWith(V);
|
||||
return &I;
|
||||
}
|
||||
|
||||
/// Replace operand of instruction and add old operand to the worklist.
|
||||
Instruction *replaceOperand(Instruction &I, unsigned OpNum, Value *V) {
|
||||
Worklist.addValue(I.getOperand(OpNum));
|
||||
I.setOperand(OpNum, V);
|
||||
return &I;
|
||||
}
|
||||
|
||||
/// Replace use and add the previously used value to the worklist.
|
||||
void replaceUse(Use &U, Value *NewValue) {
|
||||
Worklist.addValue(U);
|
||||
U = NewValue;
|
||||
}
|
||||
|
||||
/// Combiner aware instruction erasure.
|
||||
///
|
||||
/// When dealing with an instruction that has side effects or produces a void
|
||||
/// value, we can't rely on DCE to delete the instruction. Instead, visit
|
||||
/// methods should return the value returned by this function.
|
||||
virtual Instruction *eraseInstFromFunction(Instruction &I) = 0;
|
||||
|
||||
void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
|
||||
const Instruction *CxtI) const {
|
||||
llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
KnownBits computeKnownBits(const Value *V, unsigned Depth,
|
||||
const Instruction *CxtI) const {
|
||||
return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false,
|
||||
unsigned Depth = 0,
|
||||
const Instruction *CxtI = nullptr) {
|
||||
return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0,
|
||||
const Instruction *CxtI = nullptr) const {
|
||||
return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0,
|
||||
const Instruction *CxtI = nullptr) const {
|
||||
return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
|
||||
const Value *RHS,
|
||||
const Instruction *CxtI) const {
|
||||
return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
OverflowResult computeOverflowForSignedMul(const Value *LHS, const Value *RHS,
|
||||
const Instruction *CxtI) const {
|
||||
return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
|
||||
const Value *RHS,
|
||||
const Instruction *CxtI) const {
|
||||
return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
OverflowResult computeOverflowForSignedAdd(const Value *LHS, const Value *RHS,
|
||||
const Instruction *CxtI) const {
|
||||
return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
OverflowResult computeOverflowForUnsignedSub(const Value *LHS,
|
||||
const Value *RHS,
|
||||
const Instruction *CxtI) const {
|
||||
return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS,
|
||||
const Instruction *CxtI) const {
|
||||
return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
|
||||
}
|
||||
|
||||
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo,
|
||||
const APInt &DemandedMask, KnownBits &Known,
|
||||
unsigned Depth = 0) = 0;
|
||||
virtual Value *
|
||||
SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts,
|
||||
unsigned Depth = 0,
|
||||
bool AllowMultipleUsers = false) = 0;
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
||||
#undef DEBUG_TYPE
|
||||
|
||||
#endif

lib/Analysis/TargetTransformInfo.cpp
@@ -322,6 +322,29 @@ bool TargetTransformInfo::emitGetActiveLaneMask() const {
  return TTIImpl->emitGetActiveLaneMask();
}

Optional<Instruction *>
TargetTransformInfo::instCombineIntrinsic(InstCombiner &IC,
                                          IntrinsicInst &II) const {
  return TTIImpl->instCombineIntrinsic(IC, II);
}

Optional<Value *> TargetTransformInfo::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  return TTIImpl->simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
}

Optional<Value *> TargetTransformInfo::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  return TTIImpl->simplifyDemandedVectorEltsIntrinsic(
      IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
      SimplifyAndSetOp);
}

void TargetTransformInfo::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
  return TTIImpl->getUnrollingPreferences(L, SE, UP);

lib/IR/Function.cpp
@@ -618,6 +618,10 @@ static const char * const IntrinsicNameTable[] = {
#include "llvm/IR/IntrinsicImpl.inc"
#undef GET_INTRINSIC_TARGET_DATA

bool Function::isTargetIntrinsic() const {
  return IntID > TargetInfos[0].Count;
}

/// Find the segment of \c IntrinsicNameTable for intrinsics with the same
/// target as \c Name, or the generic table if \c Name is not target specific.
///

lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (new file, 895 lines)
@@ -0,0 +1,895 @@
//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU specific parts of the InstCombine pass,
// invoked through the TargetTransformInfo hooks added in this commit. It uses
// the target's detailed information to provide more precise combines for
// AMDGPU intrinsics, while letting the target independent InstCombine handle
// the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
|
||||
//
|
||||
// A single NaN input is folded to minnum, so we rely on that folding for
|
||||
// handling NaNs.
|
||||
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
|
||||
const APFloat &Src2) {
|
||||
APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
|
||||
|
||||
APFloat::cmpResult Cmp0 = Max3.compare(Src0);
|
||||
assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
|
||||
if (Cmp0 == APFloat::cmpEqual)
|
||||
return maxnum(Src1, Src2);
|
||||
|
||||
APFloat::cmpResult Cmp1 = Max3.compare(Src1);
|
||||
assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
|
||||
if (Cmp1 == APFloat::cmpEqual)
|
||||
return maxnum(Src0, Src2);
|
||||
|
||||
return maxnum(Src0, Src1);
|
||||
}
|
||||
|
||||
Optional<Instruction *>
|
||||
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
|
||||
Intrinsic::ID IID = II.getIntrinsicID();
|
||||
switch (IID) {
|
||||
default:
|
||||
break;
|
||||
case Intrinsic::amdgcn_rcp: {
|
||||
Value *Src = II.getArgOperand(0);
|
||||
|
||||
// TODO: Move to ConstantFolding/InstSimplify?
|
||||
if (isa<UndefValue>(Src)) {
|
||||
Type *Ty = II.getType();
|
||||
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
|
||||
return IC.replaceInstUsesWith(II, QNaN);
|
||||
}
|
||||
|
||||
if (II.isStrictFP())
|
||||
break;
|
||||
|
||||
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
|
||||
const APFloat &ArgVal = C->getValueAPF();
|
||||
APFloat Val(ArgVal.getSemantics(), 1);
|
||||
Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
|
||||
|
||||
// This is more precise than the instruction may give.
|
||||
//
|
||||
// TODO: The instruction always flushes denormal results (except for f16),
|
||||
// should this also?
|
||||
return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_rsq: {
|
||||
Value *Src = II.getArgOperand(0);
|
||||
|
||||
// TODO: Move to ConstantFolding/InstSimplify?
|
||||
if (isa<UndefValue>(Src)) {
|
||||
Type *Ty = II.getType();
|
||||
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
|
||||
return IC.replaceInstUsesWith(II, QNaN);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_frexp_mant:
|
||||
case Intrinsic::amdgcn_frexp_exp: {
|
||||
Value *Src = II.getArgOperand(0);
|
||||
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
|
||||
int Exp;
|
||||
APFloat Significand =
|
||||
frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
|
||||
|
||||
if (IID == Intrinsic::amdgcn_frexp_mant) {
|
||||
return IC.replaceInstUsesWith(
|
||||
II, ConstantFP::get(II.getContext(), Significand));
|
||||
}
|
||||
|
||||
// Match instruction special case behavior.
|
||||
if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
|
||||
Exp = 0;
|
||||
|
||||
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
|
||||
}
|
||||
|
||||
if (isa<UndefValue>(Src)) {
|
||||
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_class: {
|
||||
enum {
|
||||
S_NAN = 1 << 0, // Signaling NaN
|
||||
Q_NAN = 1 << 1, // Quiet NaN
|
||||
N_INFINITY = 1 << 2, // Negative infinity
|
||||
N_NORMAL = 1 << 3, // Negative normal
|
||||
N_SUBNORMAL = 1 << 4, // Negative subnormal
|
||||
N_ZERO = 1 << 5, // Negative zero
|
||||
P_ZERO = 1 << 6, // Positive zero
|
||||
P_SUBNORMAL = 1 << 7, // Positive subnormal
|
||||
P_NORMAL = 1 << 8, // Positive normal
|
||||
P_INFINITY = 1 << 9 // Positive infinity
|
||||
};
|
||||
|
||||
const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
|
||||
N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
|
||||
P_NORMAL | P_INFINITY;
|
||||
|
||||
Value *Src0 = II.getArgOperand(0);
|
||||
Value *Src1 = II.getArgOperand(1);
|
||||
const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
|
||||
if (!CMask) {
|
||||
if (isa<UndefValue>(Src0)) {
|
||||
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
|
||||
}
|
||||
|
||||
if (isa<UndefValue>(Src1)) {
|
||||
return IC.replaceInstUsesWith(II,
|
||||
ConstantInt::get(II.getType(), false));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
uint32_t Mask = CMask->getZExtValue();
|
||||
|
||||
// If all tests are made, it doesn't matter what the value is.
|
||||
if ((Mask & FullMask) == FullMask) {
|
||||
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
|
||||
}
|
||||
|
||||
if ((Mask & FullMask) == 0) {
|
||||
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
|
||||
}
|
||||
|
||||
if (Mask == (S_NAN | Q_NAN)) {
|
||||
// Equivalent of isnan. Replace with standard fcmp.
|
||||
Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
|
||||
FCmp->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, FCmp);
|
||||
}
|
||||
|
||||
if (Mask == (N_ZERO | P_ZERO)) {
|
||||
// Equivalent of == 0.
|
||||
Value *FCmp =
|
||||
IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
|
||||
|
||||
FCmp->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, FCmp);
|
||||
}
|
||||
|
||||
// fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
|
||||
if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
|
||||
isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
|
||||
return IC.replaceOperand(
|
||||
II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
|
||||
}
|
||||
|
||||
const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
|
||||
if (!CVal) {
|
||||
if (isa<UndefValue>(Src0)) {
|
||||
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
|
||||
}
|
||||
|
||||
// Clamp mask to used bits
|
||||
if ((Mask & FullMask) != Mask) {
|
||||
CallInst *NewCall = IC.Builder.CreateCall(
|
||||
II.getCalledFunction(),
|
||||
{Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});
|
||||
|
||||
NewCall->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, NewCall);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
const APFloat &Val = CVal->getValueAPF();
|
||||
|
||||
bool Result =
|
||||
((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
|
||||
((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
|
||||
((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
|
||||
((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
|
||||
((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
|
||||
((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
|
||||
((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
|
||||
((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
|
||||
((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
|
||||
((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
|
||||
|
||||
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
|
||||
}
|
||||
case Intrinsic::amdgcn_cvt_pkrtz: {
|
||||
Value *Src0 = II.getArgOperand(0);
|
||||
Value *Src1 = II.getArgOperand(1);
|
||||
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
|
||||
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
|
||||
const fltSemantics &HalfSem =
|
||||
II.getType()->getScalarType()->getFltSemantics();
|
||||
bool LosesInfo;
|
||||
APFloat Val0 = C0->getValueAPF();
|
||||
APFloat Val1 = C1->getValueAPF();
|
||||
Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
|
||||
Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
|
||||
|
||||
Constant *Folded =
|
||||
ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
|
||||
ConstantFP::get(II.getContext(), Val1)});
|
||||
return IC.replaceInstUsesWith(II, Folded);
|
||||
}
|
||||
}
|
||||
|
||||
if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
|
||||
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_cvt_pknorm_i16:
|
||||
case Intrinsic::amdgcn_cvt_pknorm_u16:
|
||||
case Intrinsic::amdgcn_cvt_pk_i16:
|
||||
case Intrinsic::amdgcn_cvt_pk_u16: {
|
||||
Value *Src0 = II.getArgOperand(0);
|
||||
Value *Src1 = II.getArgOperand(1);
|
||||
|
||||
if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
|
||||
return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_ubfe:
|
||||
case Intrinsic::amdgcn_sbfe: {
|
||||
// Decompose simple cases into standard shifts.
|
||||
Value *Src = II.getArgOperand(0);
|
||||
if (isa<UndefValue>(Src)) {
|
||||
return IC.replaceInstUsesWith(II, Src);
|
||||
}
|
||||
|
||||
unsigned Width;
|
||||
Type *Ty = II.getType();
|
||||
unsigned IntSize = Ty->getIntegerBitWidth();
|
||||
|
||||
ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
|
||||
if (CWidth) {
|
||||
Width = CWidth->getZExtValue();
|
||||
if ((Width & (IntSize - 1)) == 0) {
|
||||
return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
|
||||
}
|
||||
|
||||
// Hardware ignores high bits, so remove those.
|
||||
if (Width >= IntSize) {
|
||||
return IC.replaceOperand(
|
||||
II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
|
||||
}
|
||||
}
|
||||
|
||||
unsigned Offset;
|
||||
ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
|
||||
if (COffset) {
|
||||
Offset = COffset->getZExtValue();
|
||||
if (Offset >= IntSize) {
|
||||
return IC.replaceOperand(
|
||||
II, 1,
|
||||
ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
|
||||
}
|
||||
}
|
||||
|
||||
bool Signed = IID == Intrinsic::amdgcn_sbfe;
|
||||
|
||||
if (!CWidth || !COffset)
|
||||
break;
|
||||
|
||||
// The case of Width == 0 is handled above, which makes this transformation
|
||||
// safe. If Width == 0, then the ashr and lshr instructions become poison
|
||||
// value since the shift amount would be equal to the bit size.
|
||||
assert(Width != 0);
|
||||
|
||||
// TODO: This allows folding to undef when the hardware has specific
|
||||
// behavior?
|
||||
if (Offset + Width < IntSize) {
|
||||
Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
|
||||
Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
|
||||
: IC.Builder.CreateLShr(Shl, IntSize - Width);
|
||||
RightShift->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, RightShift);
|
||||
}
|
||||
|
||||
Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
|
||||
: IC.Builder.CreateLShr(Src, Offset);
|
||||
|
||||
RightShift->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, RightShift);
|
||||
}
|
||||
case Intrinsic::amdgcn_exp:
|
||||
case Intrinsic::amdgcn_exp_compr: {
|
||||
ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
|
||||
unsigned EnBits = En->getZExtValue();
|
||||
if (EnBits == 0xf)
|
||||
break; // All inputs enabled.
|
||||
|
||||
bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
|
||||
bool Changed = false;
|
||||
for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
|
||||
if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
|
||||
(IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
|
||||
Value *Src = II.getArgOperand(I + 2);
|
||||
if (!isa<UndefValue>(Src)) {
|
||||
IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
|
||||
Changed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Changed) {
|
||||
return &II;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_fmed3: {
|
||||
// Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
|
||||
// for the shader.
|
||||
|
||||
Value *Src0 = II.getArgOperand(0);
|
||||
Value *Src1 = II.getArgOperand(1);
|
||||
Value *Src2 = II.getArgOperand(2);
|
||||
|
||||
// Checking for NaN before canonicalization provides better fidelity when
|
||||
// mapping other operations onto fmed3 since the order of operands is
|
||||
// unchanged.
|
||||
CallInst *NewCall = nullptr;
|
||||
if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
|
||||
NewCall = IC.Builder.CreateMinNum(Src1, Src2);
|
||||
} else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
|
||||
NewCall = IC.Builder.CreateMinNum(Src0, Src2);
|
||||
} else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
|
||||
NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
|
||||
}
|
||||
|
||||
if (NewCall) {
|
||||
NewCall->copyFastMathFlags(&II);
|
||||
NewCall->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, NewCall);
|
||||
}
|
||||
|
||||
bool Swap = false;
|
||||
// Canonicalize constants to RHS operands.
|
||||
//
|
||||
// fmed3(c0, x, c1) -> fmed3(x, c0, c1)
|
||||
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
|
||||
std::swap(Src0, Src1);
|
||||
Swap = true;
|
||||
}
|
||||
|
||||
if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
|
||||
std::swap(Src1, Src2);
|
||||
Swap = true;
|
||||
}
|
||||
|
||||
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
|
||||
std::swap(Src0, Src1);
|
||||
Swap = true;
|
||||
}
|
||||
|
||||
if (Swap) {
|
||||
II.setArgOperand(0, Src0);
|
||||
II.setArgOperand(1, Src1);
|
||||
II.setArgOperand(2, Src2);
|
||||
return &II;
|
||||
}
|
||||
|
||||
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
|
||||
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
|
||||
if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
|
||||
APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
|
||||
C2->getValueAPF());
|
||||
return IC.replaceInstUsesWith(
|
||||
II, ConstantFP::get(IC.Builder.getContext(), Result));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_icmp:
|
||||
case Intrinsic::amdgcn_fcmp: {
|
||||
const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
|
||||
// Guard against invalid arguments.
|
||||
int64_t CCVal = CC->getZExtValue();
|
||||
bool IsInteger = IID == Intrinsic::amdgcn_icmp;
|
||||
if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
|
||||
CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
|
||||
(!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
|
||||
CCVal > CmpInst::LAST_FCMP_PREDICATE)))
|
||||
break;
|
||||
|
||||
Value *Src0 = II.getArgOperand(0);
|
||||
Value *Src1 = II.getArgOperand(1);
|
||||
|
||||
if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
|
||||
if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
|
||||
Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
|
||||
if (CCmp->isNullValue()) {
|
||||
return IC.replaceInstUsesWith(
|
||||
II, ConstantExpr::getSExt(CCmp, II.getType()));
|
||||
}
|
||||
|
||||
// The result of V_ICMP/V_FCMP assembly instructions (which this
|
||||
// intrinsic exposes) is one bit per thread, masked with the EXEC
|
||||
// register (which contains the bitmask of live threads). So a
|
||||
// comparison that always returns true is the same as a read of the
|
||||
// EXEC register.
|
||||
Function *NewF = Intrinsic::getDeclaration(
|
||||
II.getModule(), Intrinsic::read_register, II.getType());
|
||||
Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
|
||||
MDNode *MD = MDNode::get(II.getContext(), MDArgs);
|
||||
Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
|
||||
CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
|
||||
NewCall->addAttribute(AttributeList::FunctionIndex,
|
||||
Attribute::Convergent);
|
||||
NewCall->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, NewCall);
|
||||
}
|
||||
|
||||
// Canonicalize constants to RHS.
|
||||
CmpInst::Predicate SwapPred =
|
||||
CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
|
||||
II.setArgOperand(0, Src1);
|
||||
II.setArgOperand(1, Src0);
|
||||
II.setArgOperand(
|
||||
2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
|
||||
return &II;
|
||||
}
|
||||
|
||||
if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
|
||||
break;
|
||||
|
||||
// Canonicalize compare eq with true value to compare != 0
|
||||
// llvm.amdgcn.icmp(zext (i1 x), 1, eq)
|
||||
// -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
|
||||
// llvm.amdgcn.icmp(sext (i1 x), -1, eq)
|
||||
// -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
|
||||
Value *ExtSrc;
|
||||
if (CCVal == CmpInst::ICMP_EQ &&
|
||||
((match(Src1, PatternMatch::m_One()) &&
|
||||
match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
|
||||
(match(Src1, PatternMatch::m_AllOnes()) &&
|
||||
match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
|
||||
ExtSrc->getType()->isIntegerTy(1)) {
|
||||
IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
|
||||
IC.replaceOperand(II, 2,
|
||||
ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
|
||||
return &II;
|
||||
}
|
||||
|
||||
CmpInst::Predicate SrcPred;
|
||||
Value *SrcLHS;
|
||||
Value *SrcRHS;
|
||||
|
||||
// Fold compare eq/ne with 0 from a compare result as the predicate to the
|
||||
// intrinsic. The typical use is a wave vote function in the library, which
|
||||
// will be fed from a user code condition compared with 0. Fold in the
|
||||
// redundant compare.
|
||||
|
||||
// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
|
||||
// -> llvm.amdgcn.[if]cmp(a, b, pred)
|
||||
//
|
||||
// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
|
||||
// -> llvm.amdgcn.[if]cmp(a, b, inv pred)
|
||||
if (match(Src1, PatternMatch::m_Zero()) &&
|
||||
match(Src0, PatternMatch::m_ZExtOrSExt(
|
||||
m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
|
||||
PatternMatch::m_Value(SrcRHS))))) {
|
||||
if (CCVal == CmpInst::ICMP_EQ)
|
||||
SrcPred = CmpInst::getInversePredicate(SrcPred);
|
||||
|
||||
Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
|
||||
? Intrinsic::amdgcn_fcmp
|
||||
: Intrinsic::amdgcn_icmp;
|
||||
|
||||
Type *Ty = SrcLHS->getType();
|
||||
if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
|
||||
// Promote to next legal integer type.
|
||||
unsigned Width = CmpType->getBitWidth();
|
||||
unsigned NewWidth = Width;
|
||||
|
||||
// Don't do anything for i1 comparisons.
|
||||
if (Width == 1)
|
||||
break;
|
||||
|
||||
if (Width <= 16)
|
||||
NewWidth = 16;
|
||||
else if (Width <= 32)
|
||||
NewWidth = 32;
|
||||
else if (Width <= 64)
|
||||
NewWidth = 64;
|
||||
else if (Width > 64)
|
||||
break; // Can't handle this.
|
||||
|
||||
if (Width != NewWidth) {
|
||||
IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
|
||||
if (CmpInst::isSigned(SrcPred)) {
|
||||
SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
|
||||
SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
|
||||
} else {
|
||||
SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
|
||||
SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
|
||||
}
|
||||
}
|
||||
} else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
|
||||
break;
|
||||
|
||||
Function *NewF = Intrinsic::getDeclaration(
|
||||
II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
|
||||
Value *Args[] = {SrcLHS, SrcRHS,
|
||||
ConstantInt::get(CC->getType(), SrcPred)};
|
||||
CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
|
||||
NewCall->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, NewCall);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_ballot: {
|
||||
if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
|
||||
if (Src->isZero()) {
|
||||
// amdgcn.ballot(i1 0) is zero.
|
||||
return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
|
||||
}
|
||||
|
||||
if (Src->isOne()) {
|
||||
// amdgcn.ballot(i1 1) is exec.
|
||||
const char *RegName = "exec";
|
||||
if (II.getType()->isIntegerTy(32))
|
||||
RegName = "exec_lo";
|
||||
else if (!II.getType()->isIntegerTy(64))
|
||||
break;
|
||||
|
||||
Function *NewF = Intrinsic::getDeclaration(
|
||||
II.getModule(), Intrinsic::read_register, II.getType());
|
||||
Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
|
||||
MDNode *MD = MDNode::get(II.getContext(), MDArgs);
|
||||
Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
|
||||
CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
|
||||
NewCall->addAttribute(AttributeList::FunctionIndex,
|
||||
Attribute::Convergent);
|
||||
NewCall->takeName(&II);
|
||||
return IC.replaceInstUsesWith(II, NewCall);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_wqm_vote: {
|
||||
// wqm_vote is identity when the argument is constant.
|
||||
if (!isa<Constant>(II.getArgOperand(0)))
|
||||
break;
|
||||
|
||||
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
|
||||
}
|
||||
case Intrinsic::amdgcn_kill: {
|
||||
const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
|
||||
if (!C || !C->getZExtValue())
|
||||
break;
|
||||
|
||||
// amdgcn.kill(i1 1) is a no-op
|
||||
return IC.eraseInstFromFunction(II);
|
||||
}
|
||||
case Intrinsic::amdgcn_update_dpp: {
|
||||
Value *Old = II.getArgOperand(0);
|
||||
|
||||
auto *BC = cast<ConstantInt>(II.getArgOperand(5));
|
||||
auto *RM = cast<ConstantInt>(II.getArgOperand(3));
|
||||
auto *BM = cast<ConstantInt>(II.getArgOperand(4));
|
||||
if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
|
||||
BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
|
||||
break;
|
||||
|
||||
// If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
|
||||
return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
|
||||
}
|
||||
case Intrinsic::amdgcn_permlane16:
|
||||
case Intrinsic::amdgcn_permlanex16: {
|
||||
// Discard vdst_in if it's not going to be read.
|
||||
Value *VDstIn = II.getArgOperand(0);
|
||||
if (isa<UndefValue>(VDstIn))
|
||||
break;
|
||||
|
||||
ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
|
||||
ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
|
||||
if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
|
||||
break;
|
||||
|
||||
return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
|
||||
}
|
||||
case Intrinsic::amdgcn_readfirstlane:
|
||||
case Intrinsic::amdgcn_readlane: {
|
||||
// A constant value is trivially uniform.
|
||||
if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
|
||||
return IC.replaceInstUsesWith(II, C);
|
||||
}
|
||||
|
||||
// The rest of these may not be safe if the exec may not be the same between
|
||||
// the def and use.
|
||||
Value *Src = II.getArgOperand(0);
|
||||
Instruction *SrcInst = dyn_cast<Instruction>(Src);
|
||||
if (SrcInst && SrcInst->getParent() != II.getParent())
|
||||
break;
|
||||
|
||||
// readfirstlane (readfirstlane x) -> readfirstlane x
|
||||
// readlane (readfirstlane x), y -> readfirstlane x
|
||||
if (match(Src,
|
||||
PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
|
||||
return IC.replaceInstUsesWith(II, Src);
|
||||
}
|
||||
|
||||
if (IID == Intrinsic::amdgcn_readfirstlane) {
|
||||
// readfirstlane (readlane x, y) -> readlane x, y
|
||||
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
|
||||
return IC.replaceInstUsesWith(II, Src);
|
||||
}
|
||||
} else {
|
||||
// readlane (readlane x, y), y -> readlane x, y
|
||||
if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
|
||||
PatternMatch::m_Value(),
|
||||
PatternMatch::m_Specific(II.getArgOperand(1))))) {
|
||||
return IC.replaceInstUsesWith(II, Src);
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_ldexp: {
|
||||
// FIXME: This doesn't introduce new instructions and belongs in
|
||||
// InstructionSimplify.
|
||||
Type *Ty = II.getType();
|
||||
Value *Op0 = II.getArgOperand(0);
|
||||
Value *Op1 = II.getArgOperand(1);
|
||||
|
||||
// Folding undef to qnan is safe regardless of the FP mode.
|
||||
if (isa<UndefValue>(Op0)) {
|
||||
auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
|
||||
return IC.replaceInstUsesWith(II, QNaN);
|
||||
}
|
||||
|
||||
const APFloat *C = nullptr;
|
||||
match(Op0, PatternMatch::m_APFloat(C));
|
||||
|
||||
// FIXME: Should flush denorms depending on FP mode, but that's ignored
|
||||
// everywhere else.
|
||||
//
|
||||
// These cases should be safe, even with strictfp.
|
||||
// ldexp(0.0, x) -> 0.0
|
||||
// ldexp(-0.0, x) -> -0.0
|
||||
// ldexp(inf, x) -> inf
|
||||
// ldexp(-inf, x) -> -inf
|
||||
if (C && (C->isZero() || C->isInfinity())) {
|
||||
return IC.replaceInstUsesWith(II, Op0);
|
||||
}
|
||||
|
||||
// With strictfp, be more careful about possibly needing to flush denormals
|
||||
// or not, and snan behavior depends on ieee_mode.
|
||||
if (II.isStrictFP())
|
||||
break;
|
||||
|
||||
if (C && C->isNaN()) {
|
||||
// FIXME: We just need to make the nan quiet here, but that's unavailable
|
||||
// on APFloat, only IEEEfloat
|
||||
auto *Quieted =
|
||||
ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
|
||||
return IC.replaceInstUsesWith(II, Quieted);
|
||||
}
|
||||
|
||||
// ldexp(x, 0) -> x
|
||||
// ldexp(x, undef) -> x
|
||||
if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
|
||||
return IC.replaceInstUsesWith(II, Op0);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
|
||||
///
|
||||
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
|
||||
/// struct returns.
|
||||
Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
|
||||
IntrinsicInst &II,
|
||||
APInt DemandedElts,
|
||||
int DMaskIdx = -1) {
|
||||
|
||||
auto *IIVTy = cast<VectorType>(II.getType());
|
||||
unsigned VWidth = IIVTy->getNumElements();
|
||||
if (VWidth == 1)
|
||||
return nullptr;
|
||||
|
||||
IRBuilderBase::InsertPointGuard Guard(IC.Builder);
|
||||
IC.Builder.SetInsertPoint(&II);
|
||||
|
||||
// Assume the arguments are unchanged and later override them, if needed.
|
||||
SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());
|
||||
|
||||
if (DMaskIdx < 0) {
|
||||
// Buffer case.
|
||||
|
||||
const unsigned ActiveBits = DemandedElts.getActiveBits();
|
||||
const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
|
||||
|
||||
// Start assuming the prefix of elements is demanded, but possibly clear
|
||||
// some other bits if there are trailing zeros (unused components at front)
|
||||
// and update offset.
|
||||
DemandedElts = (1 << ActiveBits) - 1;
|
||||
|
||||
if (UnusedComponentsAtFront > 0) {
|
||||
static const unsigned InvalidOffsetIdx = 0xf;
|
||||
|
||||
unsigned OffsetIdx;
|
||||
switch (II.getIntrinsicID()) {
|
||||
case Intrinsic::amdgcn_raw_buffer_load:
|
||||
OffsetIdx = 1;
|
||||
break;
|
||||
case Intrinsic::amdgcn_s_buffer_load:
|
||||
// If resulting type is vec3, there is no point in trimming the
|
||||
// load with updated offset, as the vec3 would most likely be widened to
|
||||
// vec4 anyway during lowering.
|
||||
if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
|
||||
OffsetIdx = InvalidOffsetIdx;
|
||||
else
|
||||
OffsetIdx = 1;
|
||||
break;
|
||||
case Intrinsic::amdgcn_struct_buffer_load:
|
||||
OffsetIdx = 2;
|
||||
break;
|
||||
default:
|
||||
// TODO: handle tbuffer* intrinsics.
|
||||
OffsetIdx = InvalidOffsetIdx;
|
||||
break;
|
||||
}
|
||||
|
||||
if (OffsetIdx != InvalidOffsetIdx) {
|
||||
// Clear demanded bits and update the offset.
|
||||
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
|
||||
auto *Offset = II.getArgOperand(OffsetIdx);
|
||||
unsigned SingleComponentSizeInBits =
|
||||
IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
|
||||
unsigned OffsetAdd =
|
||||
UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
|
||||
auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
|
||||
Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Image case.
|
||||
|
||||
ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
|
||||
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
|
||||
|
||||
// Mask off values that are undefined because the dmask doesn't cover them
|
||||
DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
|
||||
|
||||
unsigned NewDMaskVal = 0;
|
||||
unsigned OrigLoadIdx = 0;
|
||||
for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
|
||||
const unsigned Bit = 1 << SrcIdx;
|
||||
if (!!(DMaskVal & Bit)) {
|
||||
if (!!DemandedElts[OrigLoadIdx])
|
||||
NewDMaskVal |= Bit;
|
||||
OrigLoadIdx++;
|
||||
}
|
||||
}
|
||||
|
||||
if (DMaskVal != NewDMaskVal)
|
||||
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
|
||||
}
|
||||
|
||||
unsigned NewNumElts = DemandedElts.countPopulation();
|
||||
if (!NewNumElts)
|
||||
return UndefValue::get(II.getType());
|
||||
|
||||
// FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
|
||||
// fully supported.
|
||||
if (II.getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
|
||||
return nullptr;
|
||||
|
||||
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
|
||||
if (DMaskIdx >= 0)
|
||||
II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Validate function argument and return types, extracting overloaded types
|
||||
// along the way.
|
||||
SmallVector<Type *, 6> OverloadTys;
|
||||
if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
|
||||
return nullptr;
|
||||
|
||||
Module *M = II.getParent()->getParent()->getParent();
|
||||
Type *EltTy = IIVTy->getElementType();
|
||||
Type *NewTy =
|
||||
(NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
|
||||
|
||||
OverloadTys[0] = NewTy;
|
||||
Function *NewIntrin =
|
||||
Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
|
||||
|
||||
CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
|
||||
NewCall->takeName(&II);
|
||||
NewCall->copyMetadata(II);
|
||||
|
||||
if (NewNumElts == 1) {
|
||||
return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
|
||||
NewCall,
|
||||
DemandedElts.countTrailingZeros());
|
||||
}
|
||||
|
||||
SmallVector<int, 8> EltMask;
|
||||
unsigned NewLoadIdx = 0;
|
||||
for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
|
||||
if (!!DemandedElts[OrigLoadIdx])
|
||||
EltMask.push_back(NewLoadIdx++);
|
||||
else
|
||||
EltMask.push_back(NewNumElts);
|
||||
}
|
||||
|
||||
Value *Shuffle =
|
||||
IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
|
||||
|
||||
return Shuffle;
|
||||
}
|
||||
|
||||
Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
|
||||
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
|
||||
APInt &UndefElts2, APInt &UndefElts3,
|
||||
std::function<void(Instruction *, unsigned, APInt, APInt &)>
|
||||
SimplifyAndSetOp) const {
|
||||
switch (II.getIntrinsicID()) {
|
||||
case Intrinsic::amdgcn_buffer_load:
|
||||
case Intrinsic::amdgcn_buffer_load_format:
|
||||
case Intrinsic::amdgcn_raw_buffer_load:
|
||||
case Intrinsic::amdgcn_raw_buffer_load_format:
|
||||
case Intrinsic::amdgcn_raw_tbuffer_load:
|
||||
case Intrinsic::amdgcn_s_buffer_load:
|
||||
case Intrinsic::amdgcn_struct_buffer_load:
|
||||
case Intrinsic::amdgcn_struct_buffer_load_format:
|
||||
case Intrinsic::amdgcn_struct_tbuffer_load:
|
||||
case Intrinsic::amdgcn_tbuffer_load:
|
||||
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
|
||||
default: {
|
||||
if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
|
||||
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return None;
|
||||
}
|
@ -33,6 +33,7 @@
|
||||
namespace llvm {
|
||||
|
||||
class AMDGPUTargetLowering;
|
||||
class InstCombiner;
|
||||
class Loop;
|
||||
class ScalarEvolution;
|
||||
class Type;
|
||||
@ -223,6 +224,14 @@ public:
|
||||
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
|
||||
Value *NewV) const;
|
||||
|
||||
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
|
||||
IntrinsicInst &II) const;
|
||||
Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
|
||||
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
|
||||
APInt &UndefElts2, APInt &UndefElts3,
|
||||
std::function<void(Instruction *, unsigned, APInt, APInt &)>
|
||||
SimplifyAndSetOp) const;
|
||||
|
||||
unsigned getVectorSplitCost() { return 0; }
|
||||
|
||||
unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
|
||||
|
@ -34,6 +34,10 @@ tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget)
|
||||
|
||||
add_public_tablegen_target(AMDGPUCommonTableGen)
|
||||
|
||||
set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
|
||||
tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
|
||||
add_public_tablegen_target(InstCombineTableGen)
|
||||
|
||||
add_llvm_target(AMDGPUCodeGen
|
||||
AMDGPUAliasAnalysis.cpp
|
||||
AMDGPUAlwaysInlinePass.cpp
|
||||
@ -48,6 +52,7 @@ add_llvm_target(AMDGPUCodeGen
|
||||
AMDGPUFixFunctionBitcasts.cpp
|
||||
AMDGPUFrameLowering.cpp
|
||||
AMDGPUHSAMetadataStreamer.cpp
|
||||
AMDGPUInstCombineIntrinsic.cpp
|
||||
AMDGPUInstrInfo.cpp
|
||||
AMDGPUInstructionSelector.cpp
|
||||
AMDGPUISelDAGToDAG.cpp
|
||||
|
@ -28,6 +28,8 @@
|
||||
#include "llvm/Support/Casting.h"
|
||||
#include "llvm/Support/MachineValueType.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Transforms/InstCombine/InstCombiner.h"
|
||||
#include "llvm/Transforms/Utils/Local.h"
|
||||
#include "llvm/Transforms/Utils/LoopUtils.h"
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
@ -50,6 +52,28 @@ extern cl::opt<TailPredication::Mode> EnableTailPredication;
|
||||
|
||||
extern cl::opt<bool> EnableMaskedGatherScatters;
|
||||
|
||||
/// Convert a vector load intrinsic into a simple llvm load instruction.
|
||||
/// This is beneficial when the underlying object being addressed comes
|
||||
/// from a constant, since we get constant-folding for free.
|
||||
static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
|
||||
InstCombiner::BuilderTy &Builder) {
|
||||
auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
|
||||
|
||||
if (!IntrAlign)
|
||||
return nullptr;
|
||||
|
||||
unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
|
||||
? MemAlign
|
||||
: IntrAlign->getLimitedValue();
|
||||
|
||||
if (!isPowerOf2_32(Alignment))
|
||||
return nullptr;
|
||||
|
||||
auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
|
||||
PointerType::get(II.getType(), 0));
|
||||
return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
|
||||
}
|
||||
|
||||
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
|
||||
const Function *Callee) const {
|
||||
const TargetMachine &TM = getTLI()->getTargetMachine();
|
||||
@ -82,6 +106,114 @@ bool ARMTTIImpl::shouldFavorPostInc() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
Optional<Instruction *>
|
||||
ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
|
||||
Intrinsic::ID IID = II.getIntrinsicID();
|
||||
switch (IID) {
|
||||
default:
|
||||
break;
|
||||
case Intrinsic::arm_neon_vld1: {
|
||||
Align MemAlign =
|
||||
getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
|
||||
&IC.getAssumptionCache(), &IC.getDominatorTree());
|
||||
if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
|
||||
return IC.replaceInstUsesWith(II, V);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::arm_neon_vld2:
|
||||
case Intrinsic::arm_neon_vld3:
|
||||
case Intrinsic::arm_neon_vld4:
|
||||
case Intrinsic::arm_neon_vld2lane:
|
||||
case Intrinsic::arm_neon_vld3lane:
|
||||
case Intrinsic::arm_neon_vld4lane:
|
||||
case Intrinsic::arm_neon_vst1:
|
||||
case Intrinsic::arm_neon_vst2:
|
||||
case Intrinsic::arm_neon_vst3:
|
||||
case Intrinsic::arm_neon_vst4:
|
||||
case Intrinsic::arm_neon_vst2lane:
|
||||
case Intrinsic::arm_neon_vst3lane:
|
||||
case Intrinsic::arm_neon_vst4lane: {
|
||||
Align MemAlign =
|
||||
getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
|
||||
&IC.getAssumptionCache(), &IC.getDominatorTree());
|
||||
unsigned AlignArg = II.getNumArgOperands() - 1;
|
||||
Value *AlignArgOp = II.getArgOperand(AlignArg);
|
||||
MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
|
||||
if (Align && *Align < MemAlign) {
|
||||
return IC.replaceOperand(
|
||||
II, AlignArg,
|
||||
ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
|
||||
false));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::arm_mve_pred_i2v: {
|
||||
Value *Arg = II.getArgOperand(0);
|
||||
Value *ArgArg;
|
||||
if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
|
||||
PatternMatch::m_Value(ArgArg))) &&
|
||||
II.getType() == ArgArg->getType()) {
|
||||
return IC.replaceInstUsesWith(II, ArgArg);
|
||||
}
|
||||
Constant *XorMask;
|
||||
if (match(Arg, m_Xor(PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(
|
||||
PatternMatch::m_Value(ArgArg)),
|
||||
PatternMatch::m_Constant(XorMask))) &&
|
||||
II.getType() == ArgArg->getType()) {
|
||||
if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
|
||||
if (CI->getValue().trunc(16).isAllOnesValue()) {
|
||||
auto TrueVector = IC.Builder.CreateVectorSplat(
|
||||
cast<VectorType>(II.getType())->getNumElements(),
|
||||
IC.Builder.getTrue());
|
||||
return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
|
||||
}
|
||||
}
|
||||
}
|
||||
KnownBits ScalarKnown(32);
|
||||
if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
|
||||
ScalarKnown, 0)) {
|
||||
return &II;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Intrinsic::arm_mve_pred_v2i: {
|
||||
Value *Arg = II.getArgOperand(0);
|
||||
Value *ArgArg;
|
||||
if (match(Arg, PatternMatch::m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(
|
||||
PatternMatch::m_Value(ArgArg)))) {
|
||||
return IC.replaceInstUsesWith(II, ArgArg);
|
||||
}
|
||||
if (!II.getMetadata(LLVMContext::MD_range)) {
|
||||
Type *IntTy32 = Type::getInt32Ty(II.getContext());
|
||||
Metadata *M[] = {
|
||||
ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
|
||||
ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))};
|
||||
II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), M));
|
||||
return &II;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Intrinsic::arm_mve_vadc:
|
||||
case Intrinsic::arm_mve_vadc_predicated: {
|
||||
unsigned CarryOp =
|
||||
(II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
|
||||
assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
|
||||
"Bad type for intrinsic!");
|
||||
|
||||
KnownBits CarryKnown(32);
|
||||
if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
|
||||
CarryKnown)) {
|
||||
return &II;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
|
||||
TTI::TargetCostKind CostKind) {
|
||||
assert(Ty->isIntegerTy());
|
||||
|
@ -113,6 +113,9 @@ public:
|
||||
return !ST->isTargetDarwin() && !ST->hasMVEFloatOps();
|
||||
}
|
||||
|
||||
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
|
||||
IntrinsicInst &II) const;
|
||||
|
||||
/// \name Scalar TTI Implementations
|
||||
/// @{
|
||||
|
||||
|
@ -111,6 +111,263 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Convert NVVM intrinsics to target-generic LLVM code where possible.
|
||||
static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
|
||||
// Each NVVM intrinsic we can simplify can be replaced with one of:
|
||||
//
|
||||
// * an LLVM intrinsic,
|
||||
// * an LLVM cast operation,
|
||||
// * an LLVM binary operation, or
|
||||
// * ad-hoc LLVM IR for the particular operation.
|
||||
|
||||
// Some transformations are only valid when the module's
|
||||
// flush-denormals-to-zero (ftz) setting is true/false, whereas other
|
||||
// transformations are valid regardless of the module's ftz setting.
|
||||
enum FtzRequirementTy {
|
||||
FTZ_Any, // Any ftz setting is ok.
|
||||
FTZ_MustBeOn, // Transformation is valid only if ftz is on.
|
||||
FTZ_MustBeOff, // Transformation is valid only if ftz is off.
|
||||
};
|
||||
// Classes of NVVM intrinsics that can't be replaced one-to-one with a
|
||||
// target-generic intrinsic, cast op, or binary op but that we can nonetheless
|
||||
// simplify.
|
||||
enum SpecialCase {
|
||||
SPC_Reciprocal,
|
||||
};
|
||||
|
||||
// SimplifyAction is a poor-man's variant (plus an additional flag) that
|
||||
// represents how to replace an NVVM intrinsic with target-generic LLVM IR.
|
||||
struct SimplifyAction {
|
||||
// Invariant: At most one of these Optionals has a value.
|
||||
Optional<Intrinsic::ID> IID;
|
||||
Optional<Instruction::CastOps> CastOp;
|
||||
Optional<Instruction::BinaryOps> BinaryOp;
|
||||
Optional<SpecialCase> Special;
|
||||
|
||||
FtzRequirementTy FtzRequirement = FTZ_Any;
|
||||
|
||||
SimplifyAction() = default;
|
||||
|
||||
SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
|
||||
: IID(IID), FtzRequirement(FtzReq) {}
|
||||
|
||||
// Cast operations don't have anything to do with FTZ, so we skip that
|
||||
// argument.
|
||||
SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
|
||||
|
||||
SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
|
||||
: BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
|
||||
|
||||
SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
|
||||
: Special(Special), FtzRequirement(FtzReq) {}
|
||||
};
|
||||
|
||||
// Try to generate a SimplifyAction describing how to replace our
|
||||
// IntrinsicInstr with target-generic LLVM IR.
|
||||
const SimplifyAction Action = [II]() -> SimplifyAction {
|
||||
switch (II->getIntrinsicID()) {
|
||||
// NVVM intrinsics that map directly to LLVM intrinsics.
|
||||
case Intrinsic::nvvm_ceil_d:
|
||||
return {Intrinsic::ceil, FTZ_Any};
|
||||
case Intrinsic::nvvm_ceil_f:
|
||||
return {Intrinsic::ceil, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_ceil_ftz_f:
|
||||
return {Intrinsic::ceil, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_fabs_d:
|
||||
return {Intrinsic::fabs, FTZ_Any};
|
||||
case Intrinsic::nvvm_fabs_f:
|
||||
return {Intrinsic::fabs, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_fabs_ftz_f:
|
||||
return {Intrinsic::fabs, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_floor_d:
|
||||
return {Intrinsic::floor, FTZ_Any};
|
||||
case Intrinsic::nvvm_floor_f:
|
||||
return {Intrinsic::floor, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_floor_ftz_f:
|
||||
return {Intrinsic::floor, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_fma_rn_d:
|
||||
return {Intrinsic::fma, FTZ_Any};
|
||||
case Intrinsic::nvvm_fma_rn_f:
|
||||
return {Intrinsic::fma, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_fma_rn_ftz_f:
|
||||
return {Intrinsic::fma, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_fmax_d:
|
||||
return {Intrinsic::maxnum, FTZ_Any};
|
||||
case Intrinsic::nvvm_fmax_f:
|
||||
return {Intrinsic::maxnum, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_fmax_ftz_f:
|
||||
return {Intrinsic::maxnum, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_fmin_d:
|
||||
return {Intrinsic::minnum, FTZ_Any};
|
||||
case Intrinsic::nvvm_fmin_f:
|
||||
return {Intrinsic::minnum, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_fmin_ftz_f:
|
||||
return {Intrinsic::minnum, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_round_d:
|
||||
return {Intrinsic::round, FTZ_Any};
|
||||
case Intrinsic::nvvm_round_f:
|
||||
return {Intrinsic::round, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_round_ftz_f:
|
||||
return {Intrinsic::round, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_sqrt_rn_d:
|
||||
return {Intrinsic::sqrt, FTZ_Any};
|
||||
case Intrinsic::nvvm_sqrt_f:
|
||||
// nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
|
||||
// ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
|
||||
// the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
|
||||
// the versions with explicit ftz-ness.
|
||||
return {Intrinsic::sqrt, FTZ_Any};
|
||||
case Intrinsic::nvvm_sqrt_rn_f:
|
||||
return {Intrinsic::sqrt, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_sqrt_rn_ftz_f:
|
||||
return {Intrinsic::sqrt, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_trunc_d:
|
||||
return {Intrinsic::trunc, FTZ_Any};
|
||||
case Intrinsic::nvvm_trunc_f:
|
||||
return {Intrinsic::trunc, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_trunc_ftz_f:
|
||||
return {Intrinsic::trunc, FTZ_MustBeOn};
|
||||
|
||||
// NVVM intrinsics that map to LLVM cast operations.
|
||||
//
|
||||
// Note that llvm's target-generic conversion operators correspond to the rz
|
||||
// (round to zero) versions of the nvvm conversion intrinsics, even though
|
||||
// most everything else here uses the rn (round to nearest even) nvvm ops.
|
||||
case Intrinsic::nvvm_d2i_rz:
|
||||
case Intrinsic::nvvm_f2i_rz:
|
||||
case Intrinsic::nvvm_d2ll_rz:
|
||||
case Intrinsic::nvvm_f2ll_rz:
|
||||
return {Instruction::FPToSI};
|
||||
case Intrinsic::nvvm_d2ui_rz:
|
||||
case Intrinsic::nvvm_f2ui_rz:
|
||||
case Intrinsic::nvvm_d2ull_rz:
|
||||
case Intrinsic::nvvm_f2ull_rz:
|
||||
return {Instruction::FPToUI};
|
||||
case Intrinsic::nvvm_i2d_rz:
|
||||
case Intrinsic::nvvm_i2f_rz:
|
||||
case Intrinsic::nvvm_ll2d_rz:
|
||||
case Intrinsic::nvvm_ll2f_rz:
|
||||
return {Instruction::SIToFP};
|
||||
case Intrinsic::nvvm_ui2d_rz:
|
||||
case Intrinsic::nvvm_ui2f_rz:
|
||||
case Intrinsic::nvvm_ull2d_rz:
|
||||
case Intrinsic::nvvm_ull2f_rz:
|
||||
return {Instruction::UIToFP};
|
||||
|
||||
// NVVM intrinsics that map to LLVM binary ops.
|
||||
case Intrinsic::nvvm_add_rn_d:
|
||||
return {Instruction::FAdd, FTZ_Any};
|
||||
case Intrinsic::nvvm_add_rn_f:
|
||||
return {Instruction::FAdd, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_add_rn_ftz_f:
|
||||
return {Instruction::FAdd, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_mul_rn_d:
|
||||
return {Instruction::FMul, FTZ_Any};
|
||||
case Intrinsic::nvvm_mul_rn_f:
|
||||
return {Instruction::FMul, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_mul_rn_ftz_f:
|
||||
return {Instruction::FMul, FTZ_MustBeOn};
|
||||
case Intrinsic::nvvm_div_rn_d:
|
||||
return {Instruction::FDiv, FTZ_Any};
|
||||
case Intrinsic::nvvm_div_rn_f:
|
||||
return {Instruction::FDiv, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_div_rn_ftz_f:
|
||||
return {Instruction::FDiv, FTZ_MustBeOn};
|
||||
|
||||
// The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
|
||||
// need special handling.
|
||||
//
|
||||
// We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
|
||||
// as well.
|
||||
case Intrinsic::nvvm_rcp_rn_d:
|
||||
return {SPC_Reciprocal, FTZ_Any};
|
||||
case Intrinsic::nvvm_rcp_rn_f:
|
||||
return {SPC_Reciprocal, FTZ_MustBeOff};
|
||||
case Intrinsic::nvvm_rcp_rn_ftz_f:
|
||||
return {SPC_Reciprocal, FTZ_MustBeOn};
|
||||
|
||||
// We do not currently simplify intrinsics that give an approximate
|
||||
// answer. These include:
|
||||
//
|
||||
// - nvvm_cos_approx_{f,ftz_f}
|
||||
// - nvvm_ex2_approx_{d,f,ftz_f}
|
||||
// - nvvm_lg2_approx_{d,f,ftz_f}
|
||||
// - nvvm_sin_approx_{f,ftz_f}
|
||||
// - nvvm_sqrt_approx_{f,ftz_f}
|
||||
// - nvvm_rsqrt_approx_{d,f,ftz_f}
|
||||
// - nvvm_div_approx_{ftz_d,ftz_f,f}
|
||||
// - nvvm_rcp_approx_ftz_d
|
||||
//
|
||||
// Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
|
||||
// means that fastmath is enabled in the intrinsic. Unfortunately only
|
||||
// binary operators (currently) have a fastmath bit in SelectionDAG, so
|
||||
// this information gets lost and we can't select on it.
|
||||
//
|
||||
// TODO: div and rcp are lowered to a binary op, so these we could in
|
||||
// theory lower them to "fast fdiv".
|
||||
|
||||
default:
|
||||
return {};
|
||||
}
|
||||
}();
|
||||
|
||||
// If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
|
||||
// can bail out now. (Notice that in the case that IID is not an NVVM
|
||||
// intrinsic, we don't have to look up any module metadata, as
|
||||
// FtzRequirementTy will be FTZ_Any.)
|
||||
if (Action.FtzRequirement != FTZ_Any) {
|
||||
StringRef Attr = II->getFunction()
|
||||
->getFnAttribute("denormal-fp-math-f32")
|
||||
.getValueAsString();
|
||||
DenormalMode Mode = parseDenormalFPAttribute(Attr);
|
||||
bool FtzEnabled = Mode.Output != DenormalMode::IEEE;
|
||||
|
||||
if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Simplify to target-generic intrinsic.
|
||||
if (Action.IID) {
|
||||
SmallVector<Value *, 4> Args(II->arg_operands());
|
||||
// All the target-generic intrinsics currently of interest to us have one
|
||||
// type argument, equal to that of the nvvm intrinsic's argument.
|
||||
Type *Tys[] = {II->getArgOperand(0)->getType()};
|
||||
return CallInst::Create(
|
||||
Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
|
||||
}
|
||||
|
||||
// Simplify to target-generic binary op.
|
||||
if (Action.BinaryOp)
|
||||
return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
|
||||
II->getArgOperand(1), II->getName());
|
||||
|
||||
// Simplify to target-generic cast op.
|
||||
if (Action.CastOp)
|
||||
return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
|
||||
II->getName());
|
||||
|
||||
// All that's left are the special cases.
|
||||
if (!Action.Special)
|
||||
return nullptr;
|
||||
|
||||
switch (*Action.Special) {
|
||||
case SPC_Reciprocal:
|
||||
// Simplify reciprocal.
|
||||
return BinaryOperator::Create(
|
||||
Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
|
||||
II->getArgOperand(0), II->getName());
|
||||
}
|
||||
llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
|
||||
}
|
||||
|
||||
Optional<Instruction *>
|
||||
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
|
||||
if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
|
||||
return I;
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
int NVPTXTTIImpl::getArithmeticInstrCost(
|
||||
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
|
||||
TTI::OperandValueKind Opd1Info,
|
||||
|
@ -48,6 +48,9 @@ public:
|
||||
return AddressSpace::ADDRESS_SPACE_GENERIC;
|
||||
}
|
||||
|
||||
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
|
||||
IntrinsicInst &II) const;
|
||||
|
||||
// Loads and stores can be vectorized if the alignment is at least as big as
|
||||
// the load/store we want to vectorize.
|
||||
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
|
||||
|
@ -13,8 +13,11 @@
|
||||
#include "llvm/CodeGen/CostTable.h"
|
||||
#include "llvm/CodeGen/TargetLowering.h"
|
||||
#include "llvm/CodeGen/TargetSchedule.h"
|
||||
#include "llvm/IR/IntrinsicsPowerPC.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Transforms/InstCombine/InstCombiner.h"
|
||||
#include "llvm/Transforms/Utils/Local.h"
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "ppctti"
|
||||
@ -59,6 +62,158 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
|
||||
return TTI::PSK_Software;
|
||||
}
|
||||
|
||||
Optional<Instruction *>
|
||||
PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
|
||||
Intrinsic::ID IID = II.getIntrinsicID();
|
||||
switch (IID) {
|
||||
default:
|
||||
break;
|
||||
case Intrinsic::ppc_altivec_lvx:
|
||||
case Intrinsic::ppc_altivec_lvxl:
|
||||
// Turn PPC lvx -> load if the pointer is known aligned.
|
||||
if (getOrEnforceKnownAlignment(
|
||||
II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
|
||||
&IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
|
||||
Value *Ptr = IC.Builder.CreateBitCast(
|
||||
II.getArgOperand(0), PointerType::getUnqual(II.getType()));
|
||||
return new LoadInst(II.getType(), Ptr, "", false, Align(16));
|
||||
}
|
||||
break;
|
||||
case Intrinsic::ppc_vsx_lxvw4x:
|
||||
case Intrinsic::ppc_vsx_lxvd2x: {
|
||||
// Turn PPC VSX loads into normal loads.
|
||||
Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0),
|
||||
PointerType::getUnqual(II.getType()));
|
||||
return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
|
||||
}
|
||||
case Intrinsic::ppc_altivec_stvx:
|
||||
case Intrinsic::ppc_altivec_stvxl:
|
||||
// Turn stvx -> store if the pointer is known aligned.
|
||||
if (getOrEnforceKnownAlignment(
|
||||
II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
|
||||
&IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
|
||||
Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
|
||||
Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
|
||||
return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
|
||||
}
|
||||
break;
|
||||
case Intrinsic::ppc_vsx_stxvw4x:
|
||||
case Intrinsic::ppc_vsx_stxvd2x: {
|
||||
// Turn PPC VSX stores into normal stores.
|
||||
Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
|
||||
Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
|
||||
return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
|
||||
}
|
||||
case Intrinsic::ppc_qpx_qvlfs:
|
||||
// Turn PPC QPX qvlfs -> load if the pointer is known aligned.
|
||||
if (getOrEnforceKnownAlignment(
|
||||
II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
|
||||
&IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
|
||||
Type *VTy =
|
||||
VectorType::get(IC.Builder.getFloatTy(),
|
||||
cast<VectorType>(II.getType())->getElementCount());
|
||||
Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0),
|
||||
PointerType::getUnqual(VTy));
|
||||
Value *Load = IC.Builder.CreateLoad(VTy, Ptr);
|
||||
return new FPExtInst(Load, II.getType());
|
||||
}
|
||||
break;
|
||||
case Intrinsic::ppc_qpx_qvlfd:
|
||||
// Turn PPC QPX qvlfd -> load if the pointer is known aligned.
|
||||
if (getOrEnforceKnownAlignment(
|
||||
II.getArgOperand(0), Align(32), IC.getDataLayout(), &II,
|
||||
&IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) {
|
||||
Value *Ptr = IC.Builder.CreateBitCast(
|
||||
II.getArgOperand(0), PointerType::getUnqual(II.getType()));
|
||||
return new LoadInst(II.getType(), Ptr, "", false, Align(32));
|
||||
}
|
||||
break;
|
||||
case Intrinsic::ppc_qpx_qvstfs:
|
||||
// Turn PPC QPX qvstfs -> store if the pointer is known aligned.
|
||||
if (getOrEnforceKnownAlignment(
|
||||
II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
|
||||
&IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
|
||||
Type *VTy = VectorType::get(
|
||||
IC.Builder.getFloatTy(),
|
||||
cast<VectorType>(II.getArgOperand(0)->getType())->getElementCount());
|
||||
Value *TOp = IC.Builder.CreateFPTrunc(II.getArgOperand(0), VTy);
|
||||
Type *OpPtrTy = PointerType::getUnqual(VTy);
|
||||
Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
|
||||
return new StoreInst(TOp, Ptr, false, Align(16));
|
||||
}
|
||||
break;
|
||||
case Intrinsic::ppc_qpx_qvstfd:
|
||||
// Turn PPC QPX qvstfd -> store if the pointer is known aligned.
|
||||
if (getOrEnforceKnownAlignment(
|
||||
II.getArgOperand(1), Align(32), IC.getDataLayout(), &II,
|
||||
&IC.getAssumptionCache(), &IC.getDominatorTree()) >= 32) {
|
||||
Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
|
||||
Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
|
||||
return new StoreInst(II.getArgOperand(0), Ptr, false, Align(32));
|
||||
}
|
||||
break;
|
||||
|
||||
case Intrinsic::ppc_altivec_vperm:
|
||||
// Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
|
||||
// Note that ppc_altivec_vperm has a big-endian bias, so when creating
|
||||
// a vectorshuffle for little endian, we must undo the transformation
|
||||
// performed on vec_perm in altivec.h. That is, we must complement
|
||||
// the permutation mask with respect to 31 and reverse the order of
|
||||
// V1 and V2.
|
||||
if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
|
||||
assert(cast<VectorType>(Mask->getType())->getNumElements() == 16 &&
|
||||
"Bad type for intrinsic!");
|
||||
|
||||
// Check that all of the elements are integer constants or undefs.
|
||||
bool AllEltsOk = true;
|
||||
for (unsigned i = 0; i != 16; ++i) {
|
||||
Constant *Elt = Mask->getAggregateElement(i);
|
||||
if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
|
||||
AllEltsOk = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (AllEltsOk) {
|
||||
// Cast the input vectors to byte vectors.
|
||||
Value *Op0 =
|
||||
IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
|
||||
Value *Op1 =
|
||||
IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
|
||||
Value *Result = UndefValue::get(Op0->getType());
|
||||
|
||||
// Only extract each element once.
|
||||
Value *ExtractedElts[32];
|
||||
memset(ExtractedElts, 0, sizeof(ExtractedElts));
|
||||
|
||||
for (unsigned i = 0; i != 16; ++i) {
|
||||
if (isa<UndefValue>(Mask->getAggregateElement(i)))
|
||||
continue;
|
||||
unsigned Idx =
|
||||
cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
|
||||
Idx &= 31; // Match the hardware behavior.
|
||||
if (DL.isLittleEndian())
|
||||
Idx = 31 - Idx;
|
||||
|
||||
if (!ExtractedElts[Idx]) {
|
||||
Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
|
||||
Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
|
||||
ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
|
||||
Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
|
||||
}
|
||||
|
||||
// Insert this value into the result vector.
|
||||
Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
|
||||
IC.Builder.getInt32(i));
|
||||
}
|
||||
return CastInst::Create(Instruction::BitCast, Result, II.getType());
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
return None;
|
||||
}
|
||||
|
||||
int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
|
||||
TTI::TargetCostKind CostKind) {
|
||||
if (DisablePPCConstHoist)
|
||||
@ -849,7 +1004,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
|
||||
// The cost of the load constant for a vector extract is disregarded
|
||||
// (invariant, easily schedulable).
|
||||
return vectorCostAdjustment(1, Opcode, Val, nullptr);
|
||||
|
||||
|
||||
} else if (ST->hasDirectMove())
|
||||
// Assume permute has standard cost.
|
||||
// Assume move-to/move-from VSR have 2x standard cost.
|
||||
|
@ -41,6 +41,9 @@ public:
|
||||
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
|
||||
TLI(ST->getTargetLowering()) {}
|
||||
|
||||
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
|
||||
IntrinsicInst &II) const;
|
||||
|
||||
/// \name Scalar TTI Implementations
|
||||
/// @{
|
||||
|
||||
|
@ -47,6 +47,7 @@ set(sources
|
||||
X86IndirectThunks.cpp
|
||||
X86InterleavedAccess.cpp
|
||||
X86InsertPrefetch.cpp
|
||||
X86InstCombineIntrinsic.cpp
|
||||
X86InstrFMA3Info.cpp
|
||||
X86InstrFoldTables.cpp
|
||||
X86InstrInfo.cpp
|
||||
|
2007
lib/Target/X86/X86InstCombineIntrinsic.cpp
Normal file
2007
lib/Target/X86/X86InstCombineIntrinsic.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -22,6 +22,8 @@
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class InstCombiner;
|
||||
|
||||
class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
|
||||
typedef BasicTTIImplBase<X86TTIImpl> BaseT;
|
||||
typedef TargetTransformInfo TTI;
|
||||
@ -151,6 +153,18 @@ public:
|
||||
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
|
||||
const SCEV *Ptr);
|
||||
|
||||
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
|
||||
IntrinsicInst &II) const;
|
||||
Optional<Value *>
|
||||
simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
|
||||
APInt DemandedMask, KnownBits &Known,
|
||||
bool &KnownBitsComputed) const;
|
||||
Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
|
||||
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
|
||||
APInt &UndefElts2, APInt &UndefElts3,
|
||||
std::function<void(Instruction *, unsigned, APInt, APInt &)>
|
||||
SimplifyAndSetOp) const;
|
||||
|
||||
unsigned getAtomicMemIntrinsicMaxElementSize() const;
|
||||
|
||||
int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
|
||||
|
@ -1,7 +1,3 @@
|
||||
set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
|
||||
tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
|
||||
add_public_tablegen_target(InstCombineTableGen)
|
||||
|
||||
add_llvm_component_library(LLVMInstCombine
|
||||
InstructionCombining.cpp
|
||||
InstCombineAddSub.cpp
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "llvm/Support/AlignOf.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
#include "llvm/Support/KnownBits.h"
|
||||
#include "llvm/Transforms/InstCombine/InstCombiner.h"
|
||||
#include <cassert>
|
||||
#include <utility>
|
||||
|
||||
@ -860,7 +861,7 @@ static Instruction *foldNoWrapAdd(BinaryOperator &Add,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) {
|
||||
Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
|
||||
Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
|
||||
Constant *Op1C;
|
||||
if (!match(Op1, m_Constant(Op1C)))
|
||||
@ -886,15 +887,15 @@ Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) {
|
||||
// zext(bool) + C -> bool ? C + 1 : C
|
||||
if (match(Op0, m_ZExt(m_Value(X))) &&
|
||||
X->getType()->getScalarSizeInBits() == 1)
|
||||
return SelectInst::Create(X, AddOne(Op1C), Op1);
|
||||
return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1);
|
||||
// sext(bool) + C -> bool ? C - 1 : C
|
||||
if (match(Op0, m_SExt(m_Value(X))) &&
|
||||
X->getType()->getScalarSizeInBits() == 1)
|
||||
return SelectInst::Create(X, SubOne(Op1C), Op1);
|
||||
return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1);
|
||||
|
||||
// ~X + C --> (C-1) - X
|
||||
if (match(Op0, m_Not(m_Value(X))))
|
||||
return BinaryOperator::CreateSub(SubOne(Op1C), X);
|
||||
return BinaryOperator::CreateSub(InstCombiner::SubOne(Op1C), X);
|
||||
|
||||
const APInt *C;
|
||||
if (!match(Op1, m_APInt(C)))
|
||||
@ -1021,7 +1022,7 @@ static bool MulWillOverflow(APInt &C0, APInt &C1, bool IsSigned) {
|
||||
|
||||
// Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1)
|
||||
// does not overflow.
|
||||
Value *InstCombiner::SimplifyAddWithRemainder(BinaryOperator &I) {
|
||||
Value *InstCombinerImpl::SimplifyAddWithRemainder(BinaryOperator &I) {
|
||||
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
|
||||
Value *X, *MulOpV;
|
||||
APInt C0, MulOpC;
|
||||
@ -1097,9 +1098,9 @@ static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *
|
||||
InstCombiner::canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
|
||||
BinaryOperator &I) {
|
||||
Instruction *InstCombinerImpl::
|
||||
canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
|
||||
BinaryOperator &I) {
|
||||
assert((I.getOpcode() == Instruction::Add ||
|
||||
I.getOpcode() == Instruction::Or ||
|
||||
I.getOpcode() == Instruction::Sub) &&
|
||||
@ -1198,7 +1199,7 @@ InstCombiner::canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
|
||||
return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType());
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
|
||||
Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
|
||||
if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1),
|
||||
I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
|
||||
SQ.getWithInstruction(&I)))
|
||||
@ -1486,7 +1487,7 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I,
|
||||
: BinaryOperator::CreateFDivFMF(XY, Z, &I);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
|
||||
Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
|
||||
if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1),
|
||||
I.getFastMathFlags(),
|
||||
SQ.getWithInstruction(&I)))
|
||||
@ -1600,8 +1601,8 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
|
||||
/// Optimize pointer differences into the same array into a size. Consider:
|
||||
/// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer
|
||||
/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
|
||||
Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
|
||||
Type *Ty, bool IsNUW) {
|
||||
Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
|
||||
Type *Ty, bool IsNUW) {
|
||||
// If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
|
||||
// this.
|
||||
bool Swapped = false;
|
||||
@ -1692,7 +1693,7 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
|
||||
return Builder.CreateIntCast(Result, Ty, true);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitSub(BinaryOperator &I) {
|
||||
Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
|
||||
if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1),
|
||||
I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
|
||||
SQ.getWithInstruction(&I)))
|
||||
@ -1806,14 +1807,14 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
|
||||
Value *X;
|
||||
if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
|
||||
// C - (zext bool) --> bool ? C - 1 : C
|
||||
return SelectInst::Create(X, SubOne(C), C);
|
||||
return SelectInst::Create(X, InstCombiner::SubOne(C), C);
|
||||
if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
|
||||
// C - (sext bool) --> bool ? C + 1 : C
|
||||
return SelectInst::Create(X, AddOne(C), C);
|
||||
return SelectInst::Create(X, InstCombiner::AddOne(C), C);
|
||||
|
||||
// C - ~X == X + (1+C)
|
||||
if (match(Op1, m_Not(m_Value(X))))
|
||||
return BinaryOperator::CreateAdd(X, AddOne(C));
|
||||
return BinaryOperator::CreateAdd(X, InstCombiner::AddOne(C));
|
||||
|
||||
// Try to fold constant sub into select arguments.
|
||||
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
|
||||
@ -2094,11 +2095,11 @@ static Instruction *hoistFNegAboveFMulFDiv(Instruction &I,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitFNeg(UnaryOperator &I) {
|
||||
Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) {
|
||||
Value *Op = I.getOperand(0);
|
||||
|
||||
if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(),
|
||||
SQ.getWithInstruction(&I)))
|
||||
getSimplifyQuery().getWithInstruction(&I)))
|
||||
return replaceInstUsesWith(I, V);
|
||||
|
||||
if (Instruction *X = foldFNegIntoConstant(I))
|
||||
@ -2117,10 +2118,10 @@ Instruction *InstCombiner::visitFNeg(UnaryOperator &I) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
|
||||
Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) {
|
||||
if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1),
|
||||
I.getFastMathFlags(),
|
||||
SQ.getWithInstruction(&I)))
|
||||
getSimplifyQuery().getWithInstruction(&I)))
|
||||
return replaceInstUsesWith(I, V);
|
||||
|
||||
if (Instruction *X = foldVectorBinop(I))
|
||||
|
@ -13,10 +13,12 @@
|
||||
#include "InstCombineInternal.h"
|
||||
#include "llvm/Analysis/CmpInstAnalysis.h"
|
||||
#include "llvm/Analysis/InstructionSimplify.h"
|
||||
#include "llvm/Transforms/Utils/Local.h"
|
||||
#include "llvm/IR/ConstantRange.h"
|
||||
#include "llvm/IR/Intrinsics.h"
|
||||
#include "llvm/IR/PatternMatch.h"
|
||||
#include "llvm/Transforms/InstCombine/InstCombiner.h"
|
||||
#include "llvm/Transforms/Utils/Local.h"
|
||||
|
||||
using namespace llvm;
|
||||
using namespace PatternMatch;
|
||||
|
||||
@ -114,10 +116,9 @@ static Value *SimplifyBSwap(BinaryOperator &I,
|
||||
|
||||
/// This handles expressions of the form ((val OP C1) & C2). Where
|
||||
/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.
|
||||
Instruction *InstCombiner::OptAndOp(BinaryOperator *Op,
|
||||
ConstantInt *OpRHS,
|
||||
ConstantInt *AndRHS,
|
||||
BinaryOperator &TheAnd) {
|
||||
Instruction *InstCombinerImpl::OptAndOp(BinaryOperator *Op, ConstantInt *OpRHS,
|
||||
ConstantInt *AndRHS,
|
||||
BinaryOperator &TheAnd) {
|
||||
Value *X = Op->getOperand(0);
|
||||
|
||||
switch (Op->getOpcode()) {
|
||||
@ -161,8 +162,9 @@ Instruction *InstCombiner::OptAndOp(BinaryOperator *Op,
|
||||
/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise
|
||||
/// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates
|
||||
/// whether to treat V, Lo, and Hi as signed or not.
|
||||
Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
|
||||
bool isSigned, bool Inside) {
|
||||
Value *InstCombinerImpl::insertRangeTest(Value *V, const APInt &Lo,
|
||||
const APInt &Hi, bool isSigned,
|
||||
bool Inside) {
|
||||
assert((isSigned ? Lo.slt(Hi) : Lo.ult(Hi)) &&
|
||||
"Lo is not < Hi in range emission code!");
|
||||
|
||||
@ -437,11 +439,10 @@ getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
|
||||
/// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros
|
||||
/// and the right hand side is of type BMask_Mixed. For example,
|
||||
/// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8).
|
||||
static Value * foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
|
||||
ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
|
||||
Value *A, Value *B, Value *C, Value *D, Value *E,
|
||||
ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
|
||||
llvm::InstCombiner::BuilderTy &Builder) {
|
||||
static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
|
||||
ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C,
|
||||
Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
|
||||
InstCombiner::BuilderTy &Builder) {
|
||||
// We are given the canonical form:
|
||||
// (icmp ne (A & B), 0) & (icmp eq (A & D), E).
|
||||
// where D & E == E.
|
||||
@ -568,11 +569,9 @@ static Value * foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
|
||||
/// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side
|
||||
/// aren't of the common mask pattern type.
|
||||
static Value *foldLogOpOfMaskedICmpsAsymmetric(
|
||||
ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
|
||||
Value *A, Value *B, Value *C, Value *D, Value *E,
|
||||
ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
|
||||
unsigned LHSMask, unsigned RHSMask,
|
||||
llvm::InstCombiner::BuilderTy &Builder) {
|
||||
ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C,
|
||||
Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
|
||||
unsigned LHSMask, unsigned RHSMask, InstCombiner::BuilderTy &Builder) {
|
||||
assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
|
||||
"Expected equality predicates for masked type of icmps.");
|
||||
// Handle Mask_NotAllZeros-BMask_Mixed cases.
|
||||
@ -603,7 +602,7 @@ static Value *foldLogOpOfMaskedICmpsAsymmetric(
|
||||
/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
|
||||
/// into a single (icmp(A & X) ==/!= Y).
|
||||
static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
|
||||
llvm::InstCombiner::BuilderTy &Builder) {
|
||||
InstCombiner::BuilderTy &Builder) {
|
||||
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
|
||||
ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
|
||||
Optional<std::pair<unsigned, unsigned>> MaskPair =
|
||||
@ -748,8 +747,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
|
||||
/// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
|
||||
/// If \p Inverted is true then the check is for the inverted range, e.g.
|
||||
/// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
|
||||
Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
|
||||
bool Inverted) {
|
||||
Value *InstCombinerImpl::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
|
||||
bool Inverted) {
|
||||
// Check the lower range comparison, e.g. x >= 0
|
||||
// InstCombine already ensured that if there is a constant it's on the RHS.
|
||||
ConstantInt *RangeStart = dyn_cast<ConstantInt>(Cmp0->getOperand(1));
|
||||
@ -856,8 +855,9 @@ foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
|
||||
|
||||
// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
|
||||
// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
|
||||
Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
|
||||
BinaryOperator &Logic) {
|
||||
Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS,
|
||||
ICmpInst *RHS,
|
||||
BinaryOperator &Logic) {
|
||||
bool JoinedByAnd = Logic.getOpcode() == Instruction::And;
|
||||
assert((JoinedByAnd || Logic.getOpcode() == Instruction::Or) &&
|
||||
"Wrong opcode");
|
||||
@ -1184,8 +1184,8 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
|
||||
}
|
||||
|
||||
/// Fold (icmp)&(icmp) if possible.
|
||||
Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
|
||||
BinaryOperator &And) {
|
||||
Value *InstCombinerImpl::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
|
||||
BinaryOperator &And) {
|
||||
const SimplifyQuery Q = SQ.getWithInstruction(&And);
|
||||
|
||||
// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
|
||||
@ -1404,7 +1404,8 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Value *InstCombiner::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd) {
|
||||
Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
|
||||
bool IsAnd) {
|
||||
Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
|
||||
Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
|
||||
FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
|
||||
@ -1514,8 +1515,8 @@ static Instruction *matchDeMorgansLaws(BinaryOperator &I,
|
||||
Value *A, *B;
|
||||
if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
|
||||
match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
|
||||
!isFreeToInvert(A, A->hasOneUse()) &&
|
||||
!isFreeToInvert(B, B->hasOneUse())) {
|
||||
!InstCombiner::isFreeToInvert(A, A->hasOneUse()) &&
|
||||
!InstCombiner::isFreeToInvert(B, B->hasOneUse())) {
|
||||
Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
|
||||
return BinaryOperator::CreateNot(AndOr);
|
||||
}
|
||||
@ -1523,7 +1524,7 @@ static Instruction *matchDeMorgansLaws(BinaryOperator &I,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
bool InstCombiner::shouldOptimizeCast(CastInst *CI) {
|
||||
bool InstCombinerImpl::shouldOptimizeCast(CastInst *CI) {
|
||||
Value *CastSrc = CI->getOperand(0);
|
||||
|
||||
// Noop casts and casts of constants should be eliminated trivially.
|
||||
@ -1579,7 +1580,7 @@ static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
|
||||
}
|
||||
|
||||
/// Fold {and,or,xor} (cast X), Y.
|
||||
Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
|
||||
Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) {
|
||||
auto LogicOpc = I.getOpcode();
|
||||
assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding");
|
||||
|
||||
@ -1725,7 +1726,7 @@ static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) {
|
||||
|
||||
/// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and
|
||||
/// a common zext operand: and (binop (zext X), C), (zext X).
|
||||
Instruction *InstCombiner::narrowMaskedBinOp(BinaryOperator &And) {
|
||||
Instruction *InstCombinerImpl::narrowMaskedBinOp(BinaryOperator &And) {
|
||||
// This transform could also apply to {or, and, xor}, but there are better
|
||||
// folds for those cases, so we don't expect those patterns here. AShr is not
|
||||
// handled because it should always be transformed to LShr in this sequence.
|
||||
@ -1767,7 +1768,7 @@ Instruction *InstCombiner::narrowMaskedBinOp(BinaryOperator &And) {
|
||||
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
|
||||
// here. We should standardize that construct where it is needed or choose some
|
||||
// other way to ensure that commutated variants of patterns are not missed.
|
||||
Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
|
||||
Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
|
||||
if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1),
|
||||
SQ.getWithInstruction(&I)))
|
||||
return replaceInstUsesWith(I, V);
|
||||
@ -2033,7 +2034,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::matchBSwap(BinaryOperator &Or) {
|
||||
Instruction *InstCombinerImpl::matchBSwap(BinaryOperator &Or) {
|
||||
assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
|
||||
Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
|
||||
|
||||
@ -2216,7 +2217,7 @@ static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
|
||||
/// We have an expression of the form (A & C) | (B & D). If A is a scalar or
|
||||
/// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
|
||||
/// B, it can be used as the condition operand of a select instruction.
|
||||
Value *InstCombiner::getSelectCondition(Value *A, Value *B) {
|
||||
Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) {
|
||||
// Step 1: We may have peeked through bitcasts in the caller.
|
||||
// Exit immediately if we don't have (vector) integer types.
|
||||
Type *Ty = A->getType();
|
||||
@ -2273,8 +2274,8 @@ Value *InstCombiner::getSelectCondition(Value *A, Value *B) {
|
||||
|
||||
/// We have an expression of the form (A & C) | (B & D). Try to simplify this
|
||||
/// to "A' ? C : D", where A' is a boolean or vector of booleans.
|
||||
Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B,
|
||||
Value *D) {
|
||||
Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B,
|
||||
Value *D) {
|
||||
// The potential condition of the select may be bitcasted. In that case, look
|
||||
// through its bitcast and the corresponding bitcast of the 'not' condition.
|
||||
Type *OrigType = A->getType();
|
||||
@ -2294,8 +2295,8 @@ Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B,
|
||||
}
|
||||
|
||||
/// Fold (icmp)|(icmp) if possible.
Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &Or) {
Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &Or) {
const SimplifyQuery Q = SQ.getWithInstruction(&Or);

// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)

@@ -2560,7 +2561,7 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);

@@ -2929,8 +2930,8 @@ static Instruction *foldXorToXor(BinaryOperator &I,
return nullptr;
}

Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &I) {
Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &I) {
assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS &&
I.getOperand(1) == RHS && "Should be 'xor' with these operands");

@@ -3088,9 +3089,9 @@ static Instruction *sinkNotIntoXor(BinaryOperator &I,
return nullptr;

// We only want to do the transform if it is free to do.
if (isFreeToInvert(X, X->hasOneUse())) {
if (InstCombiner::isFreeToInvert(X, X->hasOneUse())) {
// Ok, good.
} else if (isFreeToInvert(Y, Y->hasOneUse())) {
} else if (InstCombiner::isFreeToInvert(Y, Y->hasOneUse())) {
std::swap(X, Y);
} else
return nullptr;

@@ -3102,7 +3103,7 @@ static Instruction *sinkNotIntoXor(BinaryOperator &I,
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombiner::visitXor(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1),
SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);

@@ -9,8 +9,10 @@
// This file implements the visit functions for atomic rmw instructions.
//
//===----------------------------------------------------------------------===//

#include "InstCombineInternal.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

@@ -30,7 +32,7 @@ bool isIdempotentRMW(AtomicRMWInst& RMWI) {
default:
return false;
};

auto C = dyn_cast<ConstantInt>(RMWI.getValOperand());
if(!C)
return false;

@@ -93,11 +95,11 @@ bool isSaturating(AtomicRMWInst& RMWI) {
}
}

Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
Instruction *InstCombinerImpl::visitAtomicRMWInst(AtomicRMWInst &RMWI) {

// Volatile RMWs perform a load and a store, we cannot replace this by just a
// load or just a store. We chose not to canonicalize out of general paranoia
// about user expectations around volatile.
if (RMWI.isVolatile())
return nullptr;

@@ -115,7 +117,7 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
"AtomicRMWs don't make sense with Unordered or NotAtomic");

// Any atomicrmw xchg with no uses can be converted to an atomic store if the
// ordering is compatible.
if (RMWI.getOperation() == AtomicRMWInst::Xchg &&
RMWI.use_empty()) {
if (Ordering != AtomicOrdering::Release &&

@@ -127,14 +129,14 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
SI->setAlignment(DL.getABITypeAlign(RMWI.getType()));
return eraseInstFromFunction(RMWI);
}

if (!isIdempotentRMW(RMWI))
return nullptr;

// We chose to canonicalize all idempotent operations to a single
// operation code and constant. This makes it easier for the rest of the
// optimizer to match easily. The choices of or w/0 and fadd w/-0.0 are
// arbitrary.
if (RMWI.getType()->isIntegerTy() &&
RMWI.getOperation() != AtomicRMWInst::Or) {
RMWI.setOperation(AtomicRMWInst::Or);

@@ -149,7 +151,7 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
if (Ordering != AtomicOrdering::Acquire &&
Ordering != AtomicOrdering::Monotonic)
return nullptr;

LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand(), "",
false, DL.getABITypeAlign(RMWI.getType()),
Ordering, RMWI.getSyncScopeID());
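As a hedged sketch of the idempotent-RMW canonicalization described above (example IR written for this illustration, not taken from the patch), an integer atomicrmw whose operand is the operation's identity value is rewritten to the canonical 'or with 0' form, keeping its ordering:

  %old = atomicrmw add i32* %p, i32 0 seq_cst
  ; canonicalized to
  %old = atomicrmw or i32* %p, i32 0 seq_cst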
File diff suppressed because it is too large
@@ -14,10 +14,11 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <numeric>
using namespace llvm;
using namespace PatternMatch;
@ -81,8 +82,8 @@ static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
|
||||
|
||||
/// If we find a cast of an allocation instruction, try to eliminate the cast by
|
||||
/// moving the type information into the alloc.
|
||||
Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
|
||||
AllocaInst &AI) {
|
||||
Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
|
||||
AllocaInst &AI) {
|
||||
PointerType *PTy = cast<PointerType>(CI.getType());
|
||||
|
||||
IRBuilderBase::InsertPointGuard Guard(Builder);
|
||||
@ -160,8 +161,8 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
|
||||
|
||||
/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns
|
||||
/// true for, actually insert the code to evaluate the expression.
|
||||
Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
|
||||
bool isSigned) {
|
||||
Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty,
|
||||
bool isSigned) {
|
||||
if (Constant *C = dyn_cast<Constant>(V)) {
|
||||
C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
|
||||
// If we got a constantexpr back, try to simplify it with DL info.
|
||||
@ -229,8 +230,9 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
|
||||
return InsertNewInstWith(Res, *I);
|
||||
}
|
||||
|
||||
Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1,
|
||||
const CastInst *CI2) {
|
||||
Instruction::CastOps
|
||||
InstCombinerImpl::isEliminableCastPair(const CastInst *CI1,
|
||||
const CastInst *CI2) {
|
||||
Type *SrcTy = CI1->getSrcTy();
|
||||
Type *MidTy = CI1->getDestTy();
|
||||
Type *DstTy = CI2->getDestTy();
|
||||
@ -257,7 +259,7 @@ Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1,
|
||||
}
|
||||
|
||||
/// Implement the transforms common to all CastInst visitors.
|
||||
Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
|
||||
Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
|
||||
Value *Src = CI.getOperand(0);
|
||||
|
||||
// Try to eliminate a cast of a cast.
|
||||
@ -342,7 +344,7 @@ static bool canNotEvaluateInType(Value *V, Type *Ty) {
|
||||
///
|
||||
/// This function works on both vectors and scalars.
|
||||
///
|
||||
static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
|
||||
static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC,
|
||||
Instruction *CxtI) {
|
||||
if (canAlwaysEvaluateInType(V, Ty))
|
||||
return true;
|
||||
@ -459,7 +461,8 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
|
||||
/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
|
||||
/// --->
|
||||
/// extractelement <4 x i32> %X, 1
|
||||
static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
|
||||
static Instruction *foldVecTruncToExtElt(TruncInst &Trunc,
|
||||
InstCombinerImpl &IC) {
|
||||
Value *TruncOp = Trunc.getOperand(0);
|
||||
Type *DestType = Trunc.getType();
|
||||
if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
|
||||
@ -498,7 +501,7 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
|
||||
|
||||
/// Rotate left/right may occur in a wider type than necessary because of type
|
||||
/// promotion rules. Try to narrow the inputs and convert to funnel shift.
|
||||
Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
|
||||
Instruction *InstCombinerImpl::narrowRotate(TruncInst &Trunc) {
|
||||
assert((isa<VectorType>(Trunc.getSrcTy()) ||
|
||||
shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) &&
|
||||
"Don't narrow to an illegal scalar type");
|
||||
@ -582,7 +585,7 @@ Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
|
||||
/// Try to narrow the width of math or bitwise logic instructions by pulling a
|
||||
/// truncate ahead of binary operators.
|
||||
/// TODO: Transforms for truncated shifts should be moved into here.
|
||||
Instruction *InstCombiner::narrowBinOp(TruncInst &Trunc) {
|
||||
Instruction *InstCombinerImpl::narrowBinOp(TruncInst &Trunc) {
|
||||
Type *SrcTy = Trunc.getSrcTy();
|
||||
Type *DestTy = Trunc.getType();
|
||||
if (!isa<VectorType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
|
||||
@ -687,7 +690,7 @@ static Instruction *shrinkInsertElt(CastInst &Trunc,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitTrunc(TruncInst &Trunc) {
|
||||
Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
|
||||
if (Instruction *Result = commonCastTransforms(Trunc))
|
||||
return Result;
|
||||
|
||||
@ -894,8 +897,8 @@ Instruction *InstCombiner::visitTrunc(TruncInst &Trunc) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
|
||||
bool DoTransform) {
|
||||
Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
|
||||
bool DoTransform) {
|
||||
// If we are just checking for a icmp eq of a single bit and zext'ing it
|
||||
// to an integer, then shift the bit to the appropriate place and then
|
||||
// cast to integer to avoid the comparison.
|
||||
@ -1031,7 +1034,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
|
||||
///
|
||||
/// This function works on both vectors and scalars.
|
||||
static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
|
||||
InstCombiner &IC, Instruction *CxtI) {
|
||||
InstCombinerImpl &IC, Instruction *CxtI) {
|
||||
BitsToClear = 0;
|
||||
if (canAlwaysEvaluateInType(V, Ty))
|
||||
return true;
|
||||
@ -1136,7 +1139,7 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
|
||||
}
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
|
||||
Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) {
|
||||
// If this zero extend is only used by a truncate, let the truncate be
|
||||
// eliminated before we try to optimize this zext.
|
||||
if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
|
||||
@ -1274,7 +1277,8 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
|
||||
}
|
||||
|
||||
/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp.
|
||||
Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
|
||||
Instruction *InstCombinerImpl::transformSExtICmp(ICmpInst *ICI,
|
||||
Instruction &CI) {
|
||||
Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
|
||||
ICmpInst::Predicate Pred = ICI->getPredicate();
|
||||
|
||||
@ -1410,7 +1414,7 @@ static bool canEvaluateSExtd(Value *V, Type *Ty) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitSExt(SExtInst &CI) {
|
||||
Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
|
||||
// If this sign extend is only used by a truncate, let the truncate be
|
||||
// eliminated before we try to optimize this sext.
|
||||
if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
|
||||
@ -1497,7 +1501,6 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
/// Return a Constant* for the specified floating-point constant if it fits
|
||||
/// in the specified FP type without changing its value.
|
||||
static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
|
||||
@ -1616,7 +1619,7 @@ static bool isKnownExactCastIntToFP(CastInst &I) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
|
||||
Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
|
||||
if (Instruction *I = commonCastTransforms(FPT))
|
||||
return I;
|
||||
|
||||
@ -1800,7 +1803,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitFPExt(CastInst &FPExt) {
|
||||
Instruction *InstCombinerImpl::visitFPExt(CastInst &FPExt) {
|
||||
// If the source operand is a cast from integer to FP and known exact, then
|
||||
// cast the integer operand directly to the destination type.
|
||||
Type *Ty = FPExt.getType();
|
||||
@ -1818,7 +1821,7 @@ Instruction *InstCombiner::visitFPExt(CastInst &FPExt) {
|
||||
/// This is safe if the intermediate type has enough bits in its mantissa to
|
||||
/// accurately represent all values of X. For example, this won't work with
|
||||
/// i64 -> float -> i64.
|
||||
Instruction *InstCombiner::foldItoFPtoI(CastInst &FI) {
|
||||
Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) {
|
||||
if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0)))
|
||||
return nullptr;
|
||||
|
||||
@ -1858,29 +1861,29 @@ Instruction *InstCombiner::foldItoFPtoI(CastInst &FI) {
|
||||
return replaceInstUsesWith(FI, X);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
|
||||
Instruction *InstCombinerImpl::visitFPToUI(FPToUIInst &FI) {
|
||||
if (Instruction *I = foldItoFPtoI(FI))
|
||||
return I;
|
||||
|
||||
return commonCastTransforms(FI);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
|
||||
Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) {
|
||||
if (Instruction *I = foldItoFPtoI(FI))
|
||||
return I;
|
||||
|
||||
return commonCastTransforms(FI);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
|
||||
Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) {
|
||||
return commonCastTransforms(CI);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
|
||||
Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) {
|
||||
return commonCastTransforms(CI);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
|
||||
Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
|
||||
// If the source integer type is not the intptr_t type for this target, do a
|
||||
// trunc or zext to the intptr_t type, then inttoptr of it. This allows the
|
||||
// cast to be exposed to other transforms.
|
||||
@ -1903,7 +1906,7 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
|
||||
}
|
||||
|
||||
/// Implement the transforms for cast of pointer (bitcast/ptrtoint)
|
||||
Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
|
||||
Instruction *InstCombinerImpl::commonPointerCastTransforms(CastInst &CI) {
|
||||
Value *Src = CI.getOperand(0);
|
||||
|
||||
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
|
||||
@ -1925,7 +1928,7 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
|
||||
return commonCastTransforms(CI);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
|
||||
Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
|
||||
// If the destination integer type is not the intptr_t type for this target,
|
||||
// do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast
|
||||
// to be exposed to other transforms.
|
||||
@ -1963,9 +1966,9 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
|
||||
/// Try to replace it with a shuffle (and vector/vector bitcast) if possible.
|
||||
///
|
||||
/// The source and destination vector types may have different element types.
|
||||
static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal,
|
||||
VectorType *DestTy,
|
||||
InstCombiner &IC) {
|
||||
static Instruction *
|
||||
optimizeVectorResizeWithIntegerBitCasts(Value *InVal, VectorType *DestTy,
|
||||
InstCombinerImpl &IC) {
|
||||
// We can only do this optimization if the output is a multiple of the input
|
||||
// element size, or the input is a multiple of the output element size.
|
||||
// Convert the input type to have the same element type as the output.
|
||||
@ -2165,7 +2168,7 @@ static bool collectInsertionElements(Value *V, unsigned Shift,
|
||||
///
|
||||
/// Into two insertelements that do "buildvector{%inc, %inc5}".
|
||||
static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
|
||||
InstCombiner &IC) {
|
||||
InstCombinerImpl &IC) {
|
||||
VectorType *DestVecTy = cast<VectorType>(CI.getType());
|
||||
Value *IntInput = CI.getOperand(0);
|
||||
|
||||
@ -2194,7 +2197,7 @@ static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
|
||||
/// vectors better than bitcasts of scalars because vector registers are
|
||||
/// usually not type-specific like scalar integer or scalar floating-point.
|
||||
static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
|
||||
InstCombiner &IC) {
|
||||
InstCombinerImpl &IC) {
|
||||
// TODO: Create and use a pattern matcher for ExtractElementInst.
|
||||
auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0));
|
||||
if (!ExtElt || !ExtElt->hasOneUse())
|
||||
@ -2320,7 +2323,8 @@ static bool hasStoreUsersOnly(CastInst &CI) {
|
||||
///
|
||||
/// All the related PHI nodes can be replaced by new PHI nodes with type A.
|
||||
/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
|
||||
Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
|
||||
Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI,
|
||||
PHINode *PN) {
|
||||
// BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp.
|
||||
if (hasStoreUsersOnly(CI))
|
||||
return nullptr;
|
||||
@ -2484,7 +2488,7 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
|
||||
return RetVal;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
|
||||
Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
|
||||
// If the operands are integer typed then apply the integer transforms,
|
||||
// otherwise just apply the common ones.
|
||||
Value *Src = CI.getOperand(0);
|
||||
@ -2666,7 +2670,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
|
||||
return commonCastTransforms(CI);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
|
||||
Instruction *InstCombinerImpl::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
|
||||
// If the destination pointer element type is not the same as the source's
|
||||
// first do a bitcast to the destination type, and then the addrspacecast.
|
||||
// This allows the cast to be exposed to other transforms.
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "llvm/IR/PatternMatch.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/KnownBits.h"
|
||||
#include "llvm/Transforms/InstCombine/InstCombiner.h"
|
||||
|
||||
using namespace llvm;
|
||||
using namespace PatternMatch;
|
||||
@ -142,10 +143,10 @@ static void computeUnsignedMinMaxValuesFromKnownBits(const KnownBits &Known,
|
||||
///
|
||||
/// If AndCst is non-null, then the loaded value is masked with that constant
|
||||
/// before doing the comparison. This handles cases like "A[i]&4 == 0".
|
||||
Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
|
||||
GlobalVariable *GV,
|
||||
CmpInst &ICI,
|
||||
ConstantInt *AndCst) {
|
||||
Instruction *
|
||||
InstCombinerImpl::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
|
||||
GlobalVariable *GV, CmpInst &ICI,
|
||||
ConstantInt *AndCst) {
|
||||
Constant *Init = GV->getInitializer();
|
||||
if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
|
||||
return nullptr;
|
||||
@ -422,7 +423,7 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
|
||||
///
|
||||
/// If we can't emit an optimized form for this expression, this returns null.
|
||||
///
|
||||
static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
|
||||
static Value *evaluateGEPOffsetExpression(User *GEP, InstCombinerImpl &IC,
|
||||
const DataLayout &DL) {
|
||||
gep_type_iterator GTI = gep_type_begin(GEP);
|
||||
|
||||
@ -841,9 +842,9 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
|
||||
|
||||
/// Fold comparisons between a GEP instruction and something else. At this point
|
||||
/// we know that the GEP is on the LHS of the comparison.
|
||||
Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
|
||||
ICmpInst::Predicate Cond,
|
||||
Instruction &I) {
|
||||
Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
|
||||
ICmpInst::Predicate Cond,
|
||||
Instruction &I) {
|
||||
// Don't transform signed compares of GEPs into index compares. Even if the
|
||||
// GEP is inbounds, the final add of the base pointer can have signed overflow
|
||||
// and would change the result of the icmp.
|
||||
@ -1021,9 +1022,9 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
|
||||
return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::foldAllocaCmp(ICmpInst &ICI,
|
||||
const AllocaInst *Alloca,
|
||||
const Value *Other) {
|
||||
Instruction *InstCombinerImpl::foldAllocaCmp(ICmpInst &ICI,
|
||||
const AllocaInst *Alloca,
|
||||
const Value *Other) {
|
||||
assert(ICI.isEquality() && "Cannot fold non-equality comparison.");
|
||||
|
||||
// It would be tempting to fold away comparisons between allocas and any
|
||||
@ -1099,8 +1100,8 @@ Instruction *InstCombiner::foldAllocaCmp(ICmpInst &ICI,
|
||||
}
|
||||
|
||||
/// Fold "icmp pred (X+C), X".
|
||||
Instruction *InstCombiner::foldICmpAddOpConst(Value *X, const APInt &C,
|
||||
ICmpInst::Predicate Pred) {
|
||||
Instruction *InstCombinerImpl::foldICmpAddOpConst(Value *X, const APInt &C,
|
||||
ICmpInst::Predicate Pred) {
|
||||
// From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
|
||||
// so the values can never be equal. Similarly for all other "or equals"
|
||||
// operators.
|
||||
@ -1149,9 +1150,9 @@ Instruction *InstCombiner::foldICmpAddOpConst(Value *X, const APInt &C,
|
||||
/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" ->
|
||||
/// (icmp eq/ne A, Log2(AP2/AP1)) ->
|
||||
/// (icmp eq/ne A, Log2(AP2) - Log2(AP1)).
|
||||
Instruction *InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value *A,
|
||||
const APInt &AP1,
|
||||
const APInt &AP2) {
|
||||
Instruction *InstCombinerImpl::foldICmpShrConstConst(ICmpInst &I, Value *A,
|
||||
const APInt &AP1,
|
||||
const APInt &AP2) {
|
||||
assert(I.isEquality() && "Cannot fold icmp gt/lt");
|
||||
|
||||
auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
|
||||
@ -1208,9 +1209,9 @@ Instruction *InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value *A,
|
||||
|
||||
/// Handle "(icmp eq/ne (shl AP2, A), AP1)" ->
|
||||
/// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)).
|
||||
Instruction *InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value *A,
|
||||
const APInt &AP1,
|
||||
const APInt &AP2) {
|
||||
Instruction *InstCombinerImpl::foldICmpShlConstConst(ICmpInst &I, Value *A,
|
||||
const APInt &AP1,
|
||||
const APInt &AP2) {
|
||||
assert(I.isEquality() && "Cannot fold icmp gt/lt");
|
||||
|
||||
auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
|
||||
@@ -1254,7 +1255,7 @@ Instruction *InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value *A,
///
static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
ConstantInt *CI2, ConstantInt *CI1,
InstCombiner &IC) {
InstCombinerImpl &IC) {
// The transformation we're trying to do here is to transform this into an
// llvm.sadd.with.overflow. To do this, we have to replace the original add
// with a narrower add, and discard the add-with-constant that is part of the

@@ -1340,7 +1341,7 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
/// icmp eq/ne (urem/srem %x, %y), 0
/// iff %y is a power-of-two, we can replace this with a bit test:
/// icmp eq/ne (and %x, (add %y, -1)), 0
Instruction *InstCombiner::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) {
Instruction *InstCombinerImpl::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) {
// This fold is only valid for equality predicates.
if (!I.isEquality())
return nullptr;
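A minimal sketch of the bit-test fold described above, assuming a constant power-of-two divisor of 8 (example IR written for this illustration, not taken from the patch):

  %r = urem i32 %x, 8
  %c = icmp eq i32 %r, 0
  ; becomes
  %m = and i32 %x, 7        ; 7 == 8 - 1
  %c = icmp eq i32 %m, 0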
@ -1359,7 +1360,7 @@ Instruction *InstCombiner::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) {
|
||||
|
||||
/// Fold equality-comparison between zero and any (maybe truncated) right-shift
|
||||
/// by one-less-than-bitwidth into a sign test on the original value.
|
||||
Instruction *InstCombiner::foldSignBitTest(ICmpInst &I) {
|
||||
Instruction *InstCombinerImpl::foldSignBitTest(ICmpInst &I) {
|
||||
Instruction *Val;
|
||||
ICmpInst::Predicate Pred;
|
||||
if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero())))
|
||||
@ -1390,7 +1391,7 @@ Instruction *InstCombiner::foldSignBitTest(ICmpInst &I) {
|
||||
}
|
||||
|
||||
// Handle icmp pred X, 0
|
||||
Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) {
|
||||
Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) {
|
||||
CmpInst::Predicate Pred = Cmp.getPredicate();
|
||||
if (!match(Cmp.getOperand(1), m_Zero()))
|
||||
return nullptr;
|
||||
@ -1431,7 +1432,7 @@ Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) {
|
||||
/// should be moved to some other helper and extended as noted below (it is also
|
||||
/// possible that code has been made unnecessary - do we canonicalize IR to
|
||||
/// overflow/saturating intrinsics or not?).
|
||||
Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
|
||||
Instruction *InstCombinerImpl::foldICmpWithConstant(ICmpInst &Cmp) {
|
||||
// Match the following pattern, which is a common idiom when writing
|
||||
// overflow-safe integer arithmetic functions. The source performs an addition
|
||||
// in wider type and explicitly checks for overflow using comparisons against
|
||||
@ -1477,7 +1478,7 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
|
||||
}
|
||||
|
||||
/// Canonicalize icmp instructions based on dominating conditions.
|
||||
Instruction *InstCombiner::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
|
||||
Instruction *InstCombinerImpl::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
|
||||
// This is a cheap/incomplete check for dominance - just match a single
|
||||
// predecessor with a conditional branch.
|
||||
BasicBlock *CmpBB = Cmp.getParent();
|
||||
@ -1547,9 +1548,9 @@ Instruction *InstCombiner::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
|
||||
}
|
||||
|
||||
/// Fold icmp (trunc X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp,
|
||||
TruncInst *Trunc,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
|
||||
TruncInst *Trunc,
|
||||
const APInt &C) {
|
||||
ICmpInst::Predicate Pred = Cmp.getPredicate();
|
||||
Value *X = Trunc->getOperand(0);
|
||||
if (C.isOneValue() && C.getBitWidth() > 1) {
|
||||
@ -1580,9 +1581,9 @@ Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold icmp (xor X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Xor,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpXorConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Xor,
|
||||
const APInt &C) {
|
||||
Value *X = Xor->getOperand(0);
|
||||
Value *Y = Xor->getOperand(1);
|
||||
const APInt *XorC;
|
||||
@ -1649,8 +1650,10 @@ Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold icmp (and (sh X, Y), C2), C1.
|
||||
Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
|
||||
const APInt &C1, const APInt &C2) {
|
||||
Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp,
|
||||
BinaryOperator *And,
|
||||
const APInt &C1,
|
||||
const APInt &C2) {
|
||||
BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0));
|
||||
if (!Shift || !Shift->isShift())
|
||||
return nullptr;
|
||||
@ -1733,9 +1736,9 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
|
||||
}
|
||||
|
||||
/// Fold icmp (and X, C2), C1.
|
||||
Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
|
||||
BinaryOperator *And,
|
||||
const APInt &C1) {
|
||||
Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp,
|
||||
BinaryOperator *And,
|
||||
const APInt &C1) {
|
||||
bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE;
|
||||
|
||||
// For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
|
||||
@ -1841,9 +1844,9 @@ Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold icmp (and X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *And,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *And,
|
||||
const APInt &C) {
|
||||
if (Instruction *I = foldICmpAndConstConst(Cmp, And, C))
|
||||
return I;
|
||||
|
||||
@ -1895,8 +1898,9 @@ Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold icmp (or X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpOrConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Or,
|
||||
const APInt &C) {
|
||||
ICmpInst::Predicate Pred = Cmp.getPredicate();
|
||||
if (C.isOneValue()) {
|
||||
// icmp slt signum(V) 1 --> icmp slt V, 1
|
||||
@ -1960,9 +1964,9 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
|
||||
}
|
||||
|
||||
/// Fold icmp (mul X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpMulConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Mul,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Mul,
|
||||
const APInt &C) {
|
||||
const APInt *MulC;
|
||||
if (!match(Mul->getOperand(1), m_APInt(MulC)))
|
||||
return nullptr;
|
||||
@ -2043,9 +2047,9 @@ static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
|
||||
}
|
||||
|
||||
/// Fold icmp (shl X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Shl,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Shl,
|
||||
const APInt &C) {
|
||||
const APInt *ShiftVal;
|
||||
if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal)))
|
||||
return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal);
|
||||
@ -2183,9 +2187,9 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold icmp ({al}shr X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Shr,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Shr,
|
||||
const APInt &C) {
|
||||
// An exact shr only shifts out zero bits, so:
|
||||
// icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
|
||||
Value *X = Shr->getOperand(0);
|
||||
@ -2276,9 +2280,9 @@ Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::foldICmpSRemConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *SRem,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *SRem,
|
||||
const APInt &C) {
|
||||
// Match an 'is positive' or 'is negative' comparison of remainder by a
|
||||
// constant power-of-2 value:
|
||||
// (X % pow2C) sgt/slt 0
|
||||
@ -2315,9 +2319,9 @@ Instruction *InstCombiner::foldICmpSRemConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold icmp (udiv X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *UDiv,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpUDivConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *UDiv,
|
||||
const APInt &C) {
|
||||
const APInt *C2;
|
||||
if (!match(UDiv->getOperand(0), m_APInt(C2)))
|
||||
return nullptr;
|
||||
@ -2344,9 +2348,9 @@ Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold icmp ({su}div X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Div,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Div,
|
||||
const APInt &C) {
|
||||
// Fold: icmp pred ([us]div X, C2), C -> range test
|
||||
// Fold this div into the comparison, producing a range check.
|
||||
// Determine, based on the divide type, what the range is being
|
||||
@ -2514,9 +2518,9 @@ Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold icmp (sub X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Sub,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Sub,
|
||||
const APInt &C) {
|
||||
Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1);
|
||||
ICmpInst::Predicate Pred = Cmp.getPredicate();
|
||||
const APInt *C2;
|
||||
@ -2576,9 +2580,9 @@ Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold icmp (add X, Y), C.
|
||||
Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Add,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *Add,
|
||||
const APInt &C) {
|
||||
Value *Y = Add->getOperand(1);
|
||||
const APInt *C2;
|
||||
if (Cmp.isEquality() || !match(Y, m_APInt(C2)))
|
||||
@ -2642,10 +2646,10 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
|
||||
Value *&RHS, ConstantInt *&Less,
|
||||
ConstantInt *&Equal,
|
||||
ConstantInt *&Greater) {
|
||||
bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
|
||||
Value *&RHS, ConstantInt *&Less,
|
||||
ConstantInt *&Equal,
|
||||
ConstantInt *&Greater) {
|
||||
// TODO: Generalize this to work with other comparison idioms or ensure
|
||||
// they get canonicalized into this form.
|
||||
|
||||
@ -2682,7 +2686,8 @@ bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
|
||||
if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) {
|
||||
// x sgt C-1 <--> x sge C <--> not(x slt C)
|
||||
auto FlippedStrictness =
|
||||
getFlippedStrictnessPredicateAndConstant(PredB, cast<Constant>(RHS2));
|
||||
InstCombiner::getFlippedStrictnessPredicateAndConstant(
|
||||
PredB, cast<Constant>(RHS2));
|
||||
if (!FlippedStrictness)
|
||||
return false;
|
||||
assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check");
|
||||
@ -2694,9 +2699,9 @@ bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
|
||||
return PredB == ICmpInst::ICMP_SLT && RHS == RHS2;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp,
|
||||
SelectInst *Select,
|
||||
ConstantInt *C) {
|
||||
Instruction *InstCombinerImpl::foldICmpSelectConstant(ICmpInst &Cmp,
|
||||
SelectInst *Select,
|
||||
ConstantInt *C) {
|
||||
|
||||
assert(C && "Cmp RHS should be a constant int!");
|
||||
// If we're testing a constant value against the result of a three way
|
||||
@ -2794,7 +2799,7 @@ static Instruction *foldICmpBitCast(ICmpInst &Cmp,
|
||||
const APInt *C;
|
||||
bool TrueIfSigned;
|
||||
if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() &&
|
||||
isSignBitCheck(Pred, *C, TrueIfSigned)) {
|
||||
InstCombiner::isSignBitCheck(Pred, *C, TrueIfSigned)) {
|
||||
if (match(BCSrcOp, m_FPExt(m_Value(X))) ||
|
||||
match(BCSrcOp, m_FPTrunc(m_Value(X)))) {
|
||||
// (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0
|
||||
@ -2870,7 +2875,7 @@ static Instruction *foldICmpBitCast(ICmpInst &Cmp,
|
||||
|
||||
/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
|
||||
/// where X is some kind of instruction.
|
||||
Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
|
||||
Instruction *InstCombinerImpl::foldICmpInstWithConstant(ICmpInst &Cmp) {
|
||||
const APInt *C;
|
||||
if (!match(Cmp.getOperand(1), m_APInt(C)))
|
||||
return nullptr;
|
||||
@ -2955,9 +2960,8 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
|
||||
|
||||
/// Fold an icmp equality instruction with binary operator LHS and constant RHS:
|
||||
/// icmp eq/ne BO, C.
|
||||
Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
|
||||
BinaryOperator *BO,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
|
||||
ICmpInst &Cmp, BinaryOperator *BO, const APInt &C) {
|
||||
// TODO: Some of these folds could work with arbitrary constants, but this
|
||||
// function is limited to scalar and vector splat constants.
|
||||
if (!Cmp.isEquality())
|
||||
@ -3072,9 +3076,8 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold an equality icmp with LLVM intrinsic and constant operand.
|
||||
Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp,
|
||||
IntrinsicInst *II,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
|
||||
ICmpInst &Cmp, IntrinsicInst *II, const APInt &C) {
|
||||
Type *Ty = II->getType();
|
||||
unsigned BitWidth = C.getBitWidth();
|
||||
switch (II->getIntrinsicID()) {
|
||||
@ -3145,9 +3148,9 @@ Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
|
||||
Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
|
||||
IntrinsicInst *II,
|
||||
const APInt &C) {
|
||||
Instruction *InstCombinerImpl::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
|
||||
IntrinsicInst *II,
|
||||
const APInt &C) {
|
||||
if (Cmp.isEquality())
|
||||
return foldICmpEqIntrinsicWithConstant(Cmp, II, C);
|
||||
|
||||
@ -3204,7 +3207,7 @@ Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
|
||||
}
|
||||
|
||||
/// Handle icmp with constant (but not simple integer constant) RHS.
|
||||
Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
|
||||
Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) {
|
||||
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
|
||||
Constant *RHSC = dyn_cast<Constant>(Op1);
|
||||
Instruction *LHSI = dyn_cast<Instruction>(Op0);
|
||||
@ -3650,7 +3653,7 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
|
||||
/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit
|
||||
/// Note that the comparison is commutative, while inverted (u>=, ==) predicate
|
||||
/// will mean that we are looking for the opposite answer.
|
||||
Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
|
||||
Value *InstCombinerImpl::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
|
||||
ICmpInst::Predicate Pred;
|
||||
Value *X, *Y;
|
||||
Instruction *Mul;
|
||||
@ -3716,7 +3719,8 @@ Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
|
||||
/// TODO: A large part of this logic is duplicated in InstSimplify's
|
||||
/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
|
||||
/// duplication.
|
||||
Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I, const SimplifyQuery &SQ) {
|
||||
Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
|
||||
const SimplifyQuery &SQ) {
|
||||
const SimplifyQuery Q = SQ.getWithInstruction(&I);
|
||||
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
|
||||
|
||||
@ -4170,7 +4174,7 @@ static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
|
||||
Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
|
||||
if (!I.isEquality())
|
||||
return nullptr;
|
||||
|
||||
@ -4438,7 +4442,7 @@ static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp,
|
||||
}
|
||||
|
||||
/// Handle icmp (cast x), (cast or constant).
|
||||
Instruction *InstCombiner::foldICmpWithCastOp(ICmpInst &ICmp) {
|
||||
Instruction *InstCombinerImpl::foldICmpWithCastOp(ICmpInst &ICmp) {
|
||||
auto *CastOp0 = dyn_cast<CastInst>(ICmp.getOperand(0));
|
||||
if (!CastOp0)
|
||||
return nullptr;
|
||||
@ -4493,9 +4497,10 @@ static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) {
|
||||
}
|
||||
}
|
||||
|
||||
OverflowResult InstCombiner::computeOverflow(
|
||||
Instruction::BinaryOps BinaryOp, bool IsSigned,
|
||||
Value *LHS, Value *RHS, Instruction *CxtI) const {
|
||||
OverflowResult
|
||||
InstCombinerImpl::computeOverflow(Instruction::BinaryOps BinaryOp,
|
||||
bool IsSigned, Value *LHS, Value *RHS,
|
||||
Instruction *CxtI) const {
|
||||
switch (BinaryOp) {
|
||||
default:
|
||||
llvm_unreachable("Unsupported binary op");
|
||||
@ -4517,9 +4522,11 @@ OverflowResult InstCombiner::computeOverflow(
|
||||
}
|
||||
}
|
||||
|
||||
bool InstCombiner::OptimizeOverflowCheck(
|
||||
Instruction::BinaryOps BinaryOp, bool IsSigned, Value *LHS, Value *RHS,
|
||||
Instruction &OrigI, Value *&Result, Constant *&Overflow) {
|
||||
bool InstCombinerImpl::OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp,
|
||||
bool IsSigned, Value *LHS,
|
||||
Value *RHS, Instruction &OrigI,
|
||||
Value *&Result,
|
||||
Constant *&Overflow) {
|
||||
if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS))
|
||||
std::swap(LHS, RHS);
|
||||
|
||||
@ -4575,7 +4582,8 @@ bool InstCombiner::OptimizeOverflowCheck(
|
||||
/// \returns Instruction which must replace the compare instruction, NULL if no
|
||||
/// replacement required.
|
||||
static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
|
||||
Value *OtherVal, InstCombiner &IC) {
|
||||
Value *OtherVal,
|
||||
InstCombinerImpl &IC) {
|
||||
// Don't bother doing this transformation for pointers, don't do it for
|
||||
// vectors.
|
||||
if (!isa<IntegerType>(MulVal->getType()))
|
||||
@ -4723,7 +4731,7 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
|
||||
Function *F = Intrinsic::getDeclaration(
|
||||
I.getModule(), Intrinsic::umul_with_overflow, MulType);
|
||||
CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
|
||||
IC.Worklist.push(MulInstr);
|
||||
IC.addToWorklist(MulInstr);
|
||||
|
||||
// If there are uses of mul result other than the comparison, we know that
|
||||
// they are truncation or binary AND. Change them to use result of
|
||||
@ -4750,11 +4758,11 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
|
||||
} else {
|
||||
llvm_unreachable("Unexpected Binary operation");
|
||||
}
|
||||
IC.Worklist.push(cast<Instruction>(U));
|
||||
IC.addToWorklist(cast<Instruction>(U));
|
||||
}
|
||||
}
|
||||
if (isa<Instruction>(OtherVal))
|
||||
IC.Worklist.push(cast<Instruction>(OtherVal));
|
||||
IC.addToWorklist(cast<Instruction>(OtherVal));
|
||||
|
||||
// The original icmp gets replaced with the overflow value, maybe inverted
|
||||
// depending on predicate.
|
||||
@ -4799,7 +4807,7 @@ static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
|
||||
// If this is a normal comparison, it demands all bits. If it is a sign bit
|
||||
// comparison, it only demands the sign bit.
|
||||
bool UnusedBit;
|
||||
if (isSignBitCheck(I.getPredicate(), *RHS, UnusedBit))
|
||||
if (InstCombiner::isSignBitCheck(I.getPredicate(), *RHS, UnusedBit))
|
||||
return APInt::getSignMask(BitWidth);
|
||||
|
||||
switch (I.getPredicate()) {
|
||||
@ -4856,9 +4864,9 @@ static bool swapMayExposeCSEOpportunities(const Value *Op0, const Value *Op1) {
|
||||
/// \return true when \p UI is the only use of \p DI in the parent block
|
||||
/// and all other uses of \p DI are in blocks dominated by \p DB.
|
||||
///
|
||||
bool InstCombiner::dominatesAllUses(const Instruction *DI,
|
||||
const Instruction *UI,
|
||||
const BasicBlock *DB) const {
|
||||
bool InstCombinerImpl::dominatesAllUses(const Instruction *DI,
|
||||
const Instruction *UI,
|
||||
const BasicBlock *DB) const {
|
||||
assert(DI && UI && "Instruction not defined\n");
|
||||
// Ignore incomplete definitions.
|
||||
if (!DI->getParent())
|
||||
@ -4931,9 +4939,9 @@ static bool isChainSelectCmpBranch(const SelectInst *SI) {
|
||||
/// major restriction since a NE compare should be 'normalized' to an equal
|
||||
/// compare, which usually happens in the combiner and test case
|
||||
/// select-cmp-br.ll checks for it.
|
||||
bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
|
||||
const ICmpInst *Icmp,
|
||||
const unsigned SIOpd) {
|
||||
bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI,
|
||||
const ICmpInst *Icmp,
|
||||
const unsigned SIOpd) {
|
||||
assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
|
||||
if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
|
||||
BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
|
||||
@ -4959,7 +4967,7 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
|
||||
|
||||
/// Try to fold the comparison based on range information we can get by checking
|
||||
/// whether bits are known to be zero or one in the inputs.
|
||||
Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
|
||||
Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
|
||||
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
|
||||
Type *Ty = Op0->getType();
|
||||
ICmpInst::Predicate Pred = I.getPredicate();
|
||||
@ -5186,8 +5194,8 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
|
||||
}
|
||||
|
||||
llvm::Optional<std::pair<CmpInst::Predicate, Constant *>>
|
||||
llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred,
|
||||
Constant *C) {
|
||||
InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred,
|
||||
Constant *C) {
|
||||
assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) &&
|
||||
"Only for relational integer predicates.");
|
||||
|
||||
@ -5256,7 +5264,7 @@ llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred,
|
||||
static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
|
||||
ICmpInst::Predicate Pred = I.getPredicate();
|
||||
if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) ||
|
||||
isCanonicalPredicate(Pred))
|
||||
InstCombiner::isCanonicalPredicate(Pred))
|
||||
return nullptr;
|
||||
|
||||
Value *Op0 = I.getOperand(0);
|
||||
@ -5265,7 +5273,8 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
|
||||
if (!Op1C)
|
||||
return nullptr;
|
||||
|
||||
auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C);
|
||||
auto FlippedStrictness =
|
||||
InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C);
|
||||
if (!FlippedStrictness)
|
||||
return nullptr;
|
||||
|
||||
@ -5277,11 +5286,11 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
|
||||
static CmpInst *canonicalizeICmpPredicate(CmpInst &I) {
|
||||
// Is the predicate already canonical?
|
||||
CmpInst::Predicate Pred = I.getPredicate();
|
||||
if (isCanonicalPredicate(Pred))
|
||||
if (InstCombiner::isCanonicalPredicate(Pred))
|
||||
return nullptr;
|
||||
|
||||
// Can all users be adjusted to predicate inversion?
|
||||
if (!canFreelyInvertAllUsersOf(&I, /*IgnoredUser=*/nullptr))
|
||||
if (!InstCombiner::canFreelyInvertAllUsersOf(&I, /*IgnoredUser=*/nullptr))
|
||||
return nullptr;
|
||||
|
||||
// Ok, we can canonicalize comparison!
|
||||
@ -5510,7 +5519,7 @@ static Instruction *foldICmpOfUAddOv(ICmpInst &I) {
|
||||
return ExtractValueInst::Create(UAddOv, 1);
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
|
||||
Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
|
||||
bool Changed = false;
|
||||
const SimplifyQuery Q = SQ.getWithInstruction(&I);
|
||||
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
|
||||
@ -5748,8 +5757,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
|
||||
}
|
||||
|
||||
/// Fold fcmp ([us]itofp x, cst) if possible.
|
||||
Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
|
||||
Constant *RHSC) {
|
||||
Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I,
|
||||
Instruction *LHSI,
|
||||
Constant *RHSC) {
|
||||
if (!isa<ConstantFP>(RHSC)) return nullptr;
|
||||
const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
|
||||
|
||||
@ -6034,7 +6044,7 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
|
||||
}
|
||||
|
||||
/// Optimize fabs(X) compared with zero.
|
||||
static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombiner &IC) {
|
||||
static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) {
|
||||
Value *X;
|
||||
if (!match(I.getOperand(0), m_Intrinsic<Intrinsic::fabs>(m_Value(X))) ||
|
||||
!match(I.getOperand(1), m_PosZeroFP()))
|
||||
@ -6096,7 +6106,7 @@ static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombiner &IC) {
|
||||
}
|
||||
}
|
||||
|
||||
Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
|
||||
Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
|
||||
bool Changed = false;
|
||||
|
||||
/// Orders the operands of the compare so that they are listed from most
|
||||
|
@ -15,40 +15,32 @@
|
||||
#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
|
||||
#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
|
||||
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/Analysis/InstructionSimplify.h"
|
||||
#include "llvm/Analysis/TargetFolder.h"
|
||||
#include "llvm/Analysis/ValueTracking.h"
|
||||
#include "llvm/IR/Argument.h"
|
||||
#include "llvm/IR/BasicBlock.h"
|
||||
#include "llvm/IR/Constant.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InstVisitor.h"
|
||||
#include "llvm/IR/InstrTypes.h"
|
||||
#include "llvm/IR/Instruction.h"
|
||||
#include "llvm/IR/IntrinsicInst.h"
|
||||
#include "llvm/IR/Intrinsics.h"
|
||||
#include "llvm/IR/PatternMatch.h"
|
||||
#include "llvm/IR/Use.h"
|
||||
#include "llvm/IR/Value.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
#include "llvm/Support/Compiler.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/KnownBits.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
|
||||
#include "llvm/Transforms/InstCombine/InstCombiner.h"
|
||||
#include "llvm/Transforms/Utils/Local.h"
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
|
||||
#define DEBUG_TYPE "instcombine"
|
||||
|
||||
using namespace llvm::PatternMatch;
|
||||
|
||||
// As a default, let's assume that we want to be aggressive,
|
||||
// and attempt to traverse with no limits in attempt to sink negation.
|
||||
static constexpr unsigned NegatorDefaultMaxDepth = ~0U;
|
||||
|
||||
// Let's guesstimate that most often we will end up visiting/producing
|
||||
// fairly small number of new instructions.
|
||||
static constexpr unsigned NegatorMaxNodesSSO = 16;
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class AAResults;
|
||||
@ -65,305 +57,26 @@ class ProfileSummaryInfo;
|
||||
class TargetLibraryInfo;
|
||||
class User;
|
||||
|
||||
/// Assign a complexity or rank value to LLVM Values. This is used to reduce
|
||||
/// the amount of pattern matching needed for compares and commutative
|
||||
/// instructions. For example, if we have:
|
||||
/// icmp ugt X, Constant
|
||||
/// or
|
||||
/// xor (add X, Constant), cast Z
|
||||
///
|
||||
/// We do not have to consider the commuted variants of these patterns because
|
||||
/// canonicalization based on complexity guarantees the above ordering.
|
||||
///
|
||||
/// This routine maps IR values to various complexity ranks:
|
||||
/// 0 -> undef
|
||||
/// 1 -> Constants
|
||||
/// 2 -> Other non-instructions
|
||||
/// 3 -> Arguments
|
||||
/// 4 -> Cast and (f)neg/not instructions
|
||||
/// 5 -> Other instructions
|
||||
static inline unsigned getComplexity(Value *V) {
|
||||
if (isa<Instruction>(V)) {
|
||||
if (isa<CastInst>(V) || match(V, m_Neg(m_Value())) ||
|
||||
match(V, m_Not(m_Value())) || match(V, m_FNeg(m_Value())))
|
||||
return 4;
|
||||
return 5;
|
||||
}
|
||||
if (isa<Argument>(V))
|
||||
return 3;
|
||||
return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
|
||||
}
|
||||
|
||||
/// Predicate canonicalization reduces the number of patterns that need to be
|
||||
/// matched by other transforms. For example, we may swap the operands of a
|
||||
/// conditional branch or select to create a compare with a canonical (inverted)
|
||||
/// predicate which is then more likely to be matched with other values.
|
||||
static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) {
|
||||
switch (Pred) {
|
||||
case CmpInst::ICMP_NE:
|
||||
case CmpInst::ICMP_ULE:
|
||||
case CmpInst::ICMP_SLE:
|
||||
case CmpInst::ICMP_UGE:
|
||||
case CmpInst::ICMP_SGE:
|
||||
// TODO: There are 16 FCMP predicates. Should others be (not) canonical?
|
||||
case CmpInst::FCMP_ONE:
|
||||
case CmpInst::FCMP_OLE:
|
||||
case CmpInst::FCMP_OGE:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/// Given an exploded icmp instruction, return true if the comparison only
|
||||
/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the
|
||||
/// result of the comparison is true when the input value is signed.
|
||||
inline bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS,
|
||||
bool &TrueIfSigned) {
|
||||
switch (Pred) {
|
||||
case ICmpInst::ICMP_SLT: // True if LHS s< 0
|
||||
TrueIfSigned = true;
|
||||
return RHS.isNullValue();
|
||||
case ICmpInst::ICMP_SLE: // True if LHS s<= -1
|
||||
TrueIfSigned = true;
|
||||
return RHS.isAllOnesValue();
|
||||
case ICmpInst::ICMP_SGT: // True if LHS s> -1
|
||||
TrueIfSigned = false;
|
||||
return RHS.isAllOnesValue();
|
||||
case ICmpInst::ICMP_SGE: // True if LHS s>= 0
|
||||
TrueIfSigned = false;
|
||||
return RHS.isNullValue();
|
||||
case ICmpInst::ICMP_UGT:
|
||||
// True if LHS u> RHS and RHS == sign-bit-mask - 1
|
||||
TrueIfSigned = true;
|
||||
return RHS.isMaxSignedValue();
|
||||
case ICmpInst::ICMP_UGE:
|
||||
// True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc)
|
||||
TrueIfSigned = true;
|
||||
return RHS.isMinSignedValue();
|
||||
case ICmpInst::ICMP_ULT:
|
||||
// True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc)
|
||||
TrueIfSigned = false;
|
||||
return RHS.isMinSignedValue();
|
||||
case ICmpInst::ICMP_ULE:
|
||||
// True if LHS u<= RHS and RHS == sign-bit-mask - 1
|
||||
TrueIfSigned = false;
|
||||
return RHS.isMaxSignedValue();
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
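As a hedged example of what isSignBitCheck recognizes (IR written for this illustration, not taken from the patch), both of the following i8 comparisons depend only on the sign bit:

  %t = icmp ugt i8 %x, 127    ; true exactly when the sign bit is set  (TrueIfSigned = true)
  %f = icmp sgt i8 %x, -1     ; true exactly when the sign bit is clear (TrueIfSigned = false)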
|
||||
llvm::Optional<std::pair<CmpInst::Predicate, Constant *>>
|
||||
getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, Constant *C);
|
||||
|
||||
/// Return the source operand of a potentially bitcasted value while optionally
|
||||
/// checking if it has one use. If there is no bitcast or the one use check is
|
||||
/// not met, return the input value itself.
|
||||
static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) {
|
||||
if (auto *BitCast = dyn_cast<BitCastInst>(V))
|
||||
if (!OneUseOnly || BitCast->hasOneUse())
|
||||
return BitCast->getOperand(0);
|
||||
|
||||
// V is not a bitcast or V has more than one use and OneUseOnly is true.
|
||||
return V;
|
||||
}
|
||||
|
||||
/// Add one to a Constant
|
||||
static inline Constant *AddOne(Constant *C) {
|
||||
return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
|
||||
}
|
||||
|
||||
/// Subtract one from a Constant
|
||||
static inline Constant *SubOne(Constant *C) {
|
||||
return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
|
||||
}
|
||||
|
||||
/// Return true if the specified value is free to invert (apply ~ to).
/// This happens in cases where the ~ can be eliminated. If WillInvertAllUses
/// is true, work under the assumption that the caller intends to remove all
/// uses of V and only keep uses of ~V.
///
/// See also: canFreelyInvertAllUsersOf()
static inline bool isFreeToInvert(Value *V, bool WillInvertAllUses) {
  // ~(~(X)) -> X.
  if (match(V, m_Not(m_Value())))
    return true;

  // Constants can be considered to be not'ed values.
  if (match(V, m_AnyIntegralConstant()))
    return true;

  // Compares can be inverted if all of their uses are being modified to use
  // the ~V.
  if (isa<CmpInst>(V))
    return WillInvertAllUses;

  // If `V` is of the form `A + Constant` then `-1 - V` can be folded into
  // `(-1 - Constant) - A` if we are willing to invert all of the uses.
  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
    if (BO->getOpcode() == Instruction::Add ||
        BO->getOpcode() == Instruction::Sub)
      if (isa<Constant>(BO->getOperand(0)) || isa<Constant>(BO->getOperand(1)))
        return WillInvertAllUses;

  // Selects with invertible operands are freely invertible
  if (match(V, m_Select(m_Value(), m_Not(m_Value()), m_Not(m_Value()))))
    return WillInvertAllUses;

  return false;
}

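A concrete illustration (not part of this commit; the wrapper is hypothetical): for `add i32 %a, 7`, the inverted value `~(%a + 7)` folds to `(-1 - 7) - %a`, i.e. `-8 - %a`, so no extra `xor` survives as long as every use is rewritten.

// Illustrative sketch only, not part of this change. BO is assumed to be the
// instruction `add i32 %a, 7`.
static bool isFreeToInvertExample(BinaryOperator *BO) {
  // ~(%a + 7) == (-1 - 7) - %a, so inversion is free when all uses of BO are
  // rewritten to use the inverted value instead.
  return isFreeToInvert(BO, /*WillInvertAllUses=*/true); // expected: true
}
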
/// Given i1 V, can every user of V be freely adapted if V is changed to !V ?
/// InstCombine's canonicalizeICmpPredicate() must be kept in sync with this fn.
///
/// See also: isFreeToInvert()
static inline bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) {
  // Look at every user of V.
  for (Use &U : V->uses()) {
    if (U.getUser() == IgnoredUser)
      continue; // Don't consider this user.

    auto *I = cast<Instruction>(U.getUser());
    switch (I->getOpcode()) {
    case Instruction::Select:
      if (U.getOperandNo() != 0) // Only if the value is used as select cond.
        return false;
      break;
    case Instruction::Br:
      assert(U.getOperandNo() == 0 && "Must be branching on that value.");
      break; // Free to invert by swapping true/false values/destinations.
    case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring it.
      if (!match(I, m_Not(m_Value())))
        return false; // Not a 'not'.
      break;
    default:
      return false; // Don't know, likely not freely invertible.
    }
    // So far all users were free to invert...
  }
  return true; // Can freely invert all users!
}

/// Some binary operators require special handling to avoid poison and undefined
/// behavior. If a constant vector has undef elements, replace those undefs with
/// identity constants if possible because those are always safe to execute.
/// If no identity constant exists, replace undef with some other safe constant.
static inline Constant *getSafeVectorConstantForBinop(
    BinaryOperator::BinaryOps Opcode, Constant *In, bool IsRHSConstant) {
  auto *InVTy = dyn_cast<VectorType>(In->getType());
  assert(InVTy && "Not expecting scalars here");

  Type *EltTy = InVTy->getElementType();
  auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant);
  if (!SafeC) {
    // TODO: Should this be available as a constant utility function? It is
    // similar to getBinOpAbsorber().
    if (IsRHSConstant) {
      switch (Opcode) {
      case Instruction::SRem: // X % 1 = 0
      case Instruction::URem: // X %u 1 = 0
        SafeC = ConstantInt::get(EltTy, 1);
        break;
      case Instruction::FRem: // X % 1.0 (doesn't simplify, but it is safe)
        SafeC = ConstantFP::get(EltTy, 1.0);
        break;
      default:
        llvm_unreachable("Only rem opcodes have no identity constant for RHS");
      }
    } else {
      switch (Opcode) {
      case Instruction::Shl:  // 0 << X = 0
      case Instruction::LShr: // 0 >>u X = 0
      case Instruction::AShr: // 0 >> X = 0
      case Instruction::SDiv: // 0 / X = 0
      case Instruction::UDiv: // 0 /u X = 0
      case Instruction::SRem: // 0 % X = 0
      case Instruction::URem: // 0 %u X = 0
      case Instruction::Sub:  // 0 - X (doesn't simplify, but it is safe)
      case Instruction::FSub: // 0.0 - X (doesn't simplify, but it is safe)
      case Instruction::FDiv: // 0.0 / X (doesn't simplify, but it is safe)
      case Instruction::FRem: // 0.0 % X = 0
        SafeC = Constant::getNullValue(EltTy);
        break;
      default:
        llvm_unreachable("Expected to find identity constant for opcode");
      }
    }
  }
  assert(SafeC && "Must have safe constant for binop");
  unsigned NumElts = InVTy->getNumElements();
  SmallVector<Constant *, 16> Out(NumElts);
  for (unsigned i = 0; i != NumElts; ++i) {
    Constant *C = In->getAggregateElement(i);
    Out[i] = isa<UndefValue>(C) ? SafeC : C;
  }
  return ConstantVector::get(Out);
}

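For example (illustrative only, not part of this commit; the helper is hypothetical): a `udiv` whose right-hand constant is `<i32 4, i32 undef>` gets the undef lane replaced with the RHS identity 1, yielding `<i32 4, i32 1>`, while a left-hand `shl` constant with an undef lane has no identity and falls back to the safe constant 0.

// Illustrative sketch only, not part of this change. C is assumed to be the
// constant vector <i32 4, i32 undef> used as the RHS of a udiv.
static Constant *safeUDivRHSExample(Constant *C) {
  // X /u 1 == X, so the undef lane is rewritten to 1: result <i32 4, i32 1>.
  return getSafeVectorConstantForBinop(Instruction::UDiv, C,
                                       /*IsRHSConstant=*/true);
}
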
/// The core instruction combiner logic.
///
/// This class provides both the logic to recursively visit instructions and
/// combine them.
class LLVM_LIBRARY_VISIBILITY InstCombiner
    : public InstVisitor<InstCombiner, Instruction *> {
  // FIXME: These members shouldn't be public.
class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
    : public InstCombiner,
      public InstVisitor<InstCombinerImpl, Instruction *> {
public:
  /// A worklist of the instructions that need to be simplified.
  InstCombineWorklist &Worklist;
  InstCombinerImpl(InstCombineWorklist &Worklist, BuilderTy &Builder,
                   bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
                   TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
                   DominatorTree &DT, OptimizationRemarkEmitter &ORE,
                   BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
                   const DataLayout &DL, LoopInfo *LI)
      : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT,
                     ORE, BFI, PSI, DL, LI) {}

  /// An IRBuilder that automatically inserts new instructions into the
  /// worklist.
  using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>;
  BuilderTy &Builder;

private:
  // Mode in which we are running the combiner.
  const bool MinimizeSize;

  AAResults *AA;

  // Required analyses.
  AssumptionCache &AC;
  TargetLibraryInfo &TLI;
  DominatorTree &DT;
  const DataLayout &DL;
  const SimplifyQuery SQ;
  OptimizationRemarkEmitter &ORE;
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Optional analyses. When non-null, these can both be used to do better
  // combining and will be updated to reflect any changes.
  LoopInfo *LI;

  bool MadeIRChange = false;

public:
  InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder,
               bool MinimizeSize, AAResults *AA,
               AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
               OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
               ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI)
      : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
        AA(AA), AC(AC), TLI(TLI), DT(DT),
        DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {}
  virtual ~InstCombinerImpl() {}

  /// Run the combiner over the entire worklist until it is empty.
  ///
  /// \returns true if the IR is changed.
  bool run();

  AssumptionCache &getAssumptionCache() const { return AC; }

  const DataLayout &getDataLayout() const { return DL; }

  DominatorTree &getDominatorTree() const { return DT; }

  LoopInfo *getLoopInfo() const { return LI; }

  TargetLibraryInfo &getTargetLibraryInfo() const { return TLI; }

  // Visitation implementation - Implement instruction combining for different
  // instruction types. The semantics are as follows:
  // Return Value:
@ -726,7 +439,7 @@ public:
  /// When dealing with an instruction that has side effects or produces a void
  /// value, we can't rely on DCE to delete the instruction. Instead, visit
  /// methods should return the value returned by this function.
  Instruction *eraseInstFromFunction(Instruction &I) {
  Instruction *eraseInstFromFunction(Instruction &I) override {
    LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n');
    assert(I.use_empty() && "Cannot erase instruction that is used!");
    salvageDebugInfo(I);
@ -808,10 +521,6 @@ public:
                              Instruction::BinaryOps BinaryOp, bool IsSigned,
                              Value *LHS, Value *RHS, Instruction *CxtI) const;

  /// Maximum size of array considered when transforming.
  uint64_t MaxArraySizeForCombine = 0;

private:
  /// Performs a few simplifications for operators which are associative
  /// or commutative.
  bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
@ -857,7 +566,7 @@ private:
                             unsigned Depth, Instruction *CxtI);
  bool SimplifyDemandedBits(Instruction *I, unsigned Op,
                            const APInt &DemandedMask, KnownBits &Known,
                            unsigned Depth = 0);
                            unsigned Depth = 0) override;

  /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne
  /// bits. It also tries to handle simplifications that can be done based on
@ -877,13 +586,10 @@ private:
  /// demanded bits.
  bool SimplifyDemandedInstructionBits(Instruction &Inst);

  Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
                                               APInt DemandedElts,
                                               int DmaskIdx = -1);

  Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
                                    APInt &UndefElts, unsigned Depth = 0,
                                    bool AllowMultipleUsers = false);
  virtual Value *
  SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts,
                             unsigned Depth = 0,
                             bool AllowMultipleUsers = false) override;

  /// Canonicalize the position of binops relative to shufflevector.
  Instruction *foldVectorBinop(BinaryOperator &Inst);
@ -1023,18 +729,6 @@ private:
  Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
};

namespace {

// As a default, let's assume that we want to be aggressive,
// and attempt to traverse with no limits in attempt to sink negation.
static constexpr unsigned NegatorDefaultMaxDepth = ~0U;

// Let's guesstimate that most often we will end up visiting/producing
// fairly small number of new instructions.
static constexpr unsigned NegatorMaxNodesSSO = 16;

} // namespace

class Negator final {
  /// Top-to-bottom, def-to-use negated instruction tree we produced.
  SmallVector<Instruction *, NegatorMaxNodesSSO> NewInstructions;
@ -1078,7 +772,7 @@ public:
  /// Attempt to negate \p Root. Retuns nullptr if negation can't be performed,
  /// otherwise returns negated value.
  LLVM_NODISCARD static Value *Negate(bool LHSIsZero, Value *Root,
                                      InstCombiner &IC);
                                      InstCombinerImpl &IC);
};

} // end namespace llvm

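Illustrative only (not part of this commit; the surrounding fold, `I` and `X` are placeholders): a fold inside InstCombine reaches the negation machinery through the single Negator::Negate entry point, roughly like this.

// Illustrative sketch only, not part of this change.
// Inside some InstCombinerImpl fold that wants `0 - X` sunk into X:
//   if (Value *Negated = Negator::Negate(/*LHSIsZero=*/true, X, *this))
//     return replaceInstUsesWith(I, Negated);
// A nullptr result means the negation could not be sunk and the fold gives up.
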
@ -23,6 +23,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@ -166,7 +167,8 @@ static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
                                           APInt(64, AllocaSize), DL);
}

static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
                                            AllocaInst &AI) {
  // Check for array size of 1 (scalar allocation).
  if (!AI.isArrayAllocation()) {
    // i32 1 is the canonical array size for scalar allocations.
@ -234,7 +236,7 @@ namespace {
// instruction.
class PointerReplacer {
public:
  PointerReplacer(InstCombiner &IC) : IC(IC) {}
  PointerReplacer(InstCombinerImpl &IC) : IC(IC) {}
  void replacePointer(Instruction &I, Value *V);

private:
@ -244,7 +246,7 @@ private:

  SmallVector<Instruction *, 4> Path;
  MapVector<Value *, Value *> WorkMap;
  InstCombiner &IC;
  InstCombinerImpl &IC;
};
} // end anonymous namespace

@ -323,7 +325,7 @@ void PointerReplacer::replacePointer(Instruction &I, Value *V) {
  findLoadAndReplace(I);
}

Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
  if (auto *I = simplifyAllocaArraySize(*this, AI))
    return I;

@ -421,9 +423,9 @@ static bool isSupportedAtomicType(Type *Ty) {
/// that pointer type, load it, etc.
///
/// Note that this will create all of the instructions with whatever insert
/// point the \c InstCombiner currently is using.
LoadInst *InstCombiner::combineLoadToNewType(LoadInst &LI, Type *NewTy,
                                             const Twine &Suffix) {
/// point the \c InstCombinerImpl currently is using.
LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
                                                 const Twine &Suffix) {
  assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
         "can't fold an atomic load to requested type");

@ -445,7 +447,8 @@ LoadInst *InstCombiner::combineLoadToNewType(LoadInst &LI, Type *NewTy,
/// Combine a store to a new type.
///
/// Returns the newly created store instruction.
static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value *V) {
static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI,
                                         Value *V) {
  assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
         "can't fold an atomic store of requested type");

@ -502,7 +505,7 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value
static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
  assert(V->getType()->isPointerTy() && "Expected pointer type.");
  // Ignore possible ty* to ixx* bitcast.
  V = peekThroughBitcast(V);
  V = InstCombiner::peekThroughBitcast(V);
  // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
  // pattern.
  CmpInst::Predicate Pred;
@ -537,7 +540,8 @@ static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
/// or a volatile load. This is debatable, and might be reasonable to change
/// later. However, it is risky in case some backend or other part of LLVM is
/// relying on the exact type loaded to select appropriate atomic operations.
static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
                                               LoadInst &LI) {
  // FIXME: We could probably with some care handle both volatile and ordered
  // atomic loads here but it isn't clear that this is important.
  if (!LI.isUnordered())
@ -563,9 +567,9 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
  if (!Ty->isIntegerTy() && Ty->isSized() && !isa<ScalableVectorType>(Ty) &&
      DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) &&
      DL.typeSizeEqualsStoreSize(Ty) && !DL.isNonIntegralPointerType(Ty) &&
      !isMinMaxWithLoads(
          peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true),
          Dummy)) {
      !isMinMaxWithLoads(InstCombiner::peekThroughBitcast(
                             LI.getPointerOperand(), /*OneUseOnly=*/true),
                         Dummy)) {
    if (all_of(LI.users(), [&LI](User *U) {
          auto *SI = dyn_cast<StoreInst>(U);
          return SI && SI->getPointerOperand() != &LI &&
@ -605,7 +609,7 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
  return nullptr;
}

static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
  // FIXME: We could probably with some care handle both volatile and atomic
  // stores here but it isn't clear that this is important.
  if (!LI.isSimple())
@ -804,8 +808,9 @@ static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
// not zero. Currently, we only handle the first such index. Also, we could
// also search through non-zero constant indices if we kept track of the
// offsets those indices implied.
static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
                                     Instruction *MemI, unsigned &Idx) {
static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
                                     GetElementPtrInst *GEPI, Instruction *MemI,
                                     unsigned &Idx) {
  if (GEPI->getNumOperands() < 2)
    return false;

@ -874,7 +879,7 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
// access, but the object has only one element, we can assume that the index
// will always be zero. If we replace the GEP, return it.
template <typename T>
static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr,
static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr,
                                          T &MemI) {
  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
    unsigned Idx;
@ -916,7 +921,7 @@ static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
  return false;
}

Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
  Value *Op = LI.getOperand(0);

  // Try to canonicalize the loaded type.
@ -1033,7 +1038,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
/// and the layout of a <2 x double> is isomorphic to a [2 x double],
/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
/// Note that %U may contain non-undef values where %V1 has undef.
static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) {
static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) {
  Value *U = nullptr;
  while (auto *IV = dyn_cast<InsertValueInst>(V)) {
    auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
@ -1094,7 +1099,7 @@ static Value *likeBitCastFromVector(InstCombiner &IC, Value *V) {
/// the caller must erase the store instruction. We have to let the caller erase
/// the store instruction as otherwise there is no way to signal whether it was
/// combined or not: IC.EraseInstFromFunction returns a null pointer.
static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {
static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) {
  // FIXME: We could probably with some care handle both volatile and ordered
  // atomic stores here but it isn't clear that this is important.
  if (!SI.isUnordered())
@ -1126,7 +1131,7 @@ static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {
  return false;
}

static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
  // FIXME: We could probably with some care handle both volatile and atomic
  // stores here but it isn't clear that this is important.
  if (!SI.isSimple())
@ -1266,7 +1271,7 @@ static bool equivalentAddressValues(Value *A, Value *B) {
/// Converts store (bitcast (load (bitcast (select ...)))) to
/// store (load (select ...)), where select is minmax:
/// select ((cmp load V1, load V2), V1, V2).
static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC,
static bool removeBitcastsFromLoadStoreOnMinMax(InstCombinerImpl &IC,
                                                StoreInst &SI) {
  // bitcast?
  if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
@ -1296,7 +1301,8 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC,
  if (!all_of(LI->users(), [LI, LoadAddr](User *U) {
        auto *SI = dyn_cast<StoreInst>(U);
        return SI && SI->getPointerOperand() != LI &&
               peekThroughBitcast(SI->getPointerOperand()) != LoadAddr &&
               InstCombiner::peekThroughBitcast(SI->getPointerOperand()) !=
                   LoadAddr &&
               !SI->getPointerOperand()->isSwiftError();
      }))
    return false;
@ -1314,7 +1320,7 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC,
  return true;
}

Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
  Value *Val = SI.getOperand(0);
  Value *Ptr = SI.getOperand(1);

@ -1433,7 +1439,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
/// or:
///   *P = v1; if () { *P = v2; }
/// into a phi node with a store in the successor.
bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) {
bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
  if (!SI.isUnordered())
    return false; // This code has not been audited for volatile/ordered case.

@ -32,6 +32,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include <cassert>
#include <cstddef>
@ -46,7 +47,7 @@ using namespace PatternMatch;
/// The specific integer value is used in a context where it is known to be
/// non-zero. If this allows us to simplify the computation, do so and return
/// the new operand, otherwise return null.
static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
static Value *simplifyValueKnownNonZero(Value *V, InstCombinerImpl &IC,
                                        Instruction &CxtI) {
  // If V has multiple uses, then we would have to do more analysis to determine
  // if this is safe. For example, the use could be in dynamically unreached
@ -171,7 +172,7 @@ static Value *foldMulSelectToNegate(BinaryOperator &I,
  return nullptr;
}

Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
  if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1),
                                 SQ.getWithInstruction(&I)))
    return replaceInstUsesWith(I, V);
@ -423,7 +424,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
  return Changed ? &I : nullptr;
}

Instruction *InstCombiner::foldFPSignBitOps(BinaryOperator &I) {
Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) {
  BinaryOperator::BinaryOps Opcode = I.getOpcode();
  assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) &&
         "Expected fmul or fdiv");
@ -457,7 +458,7 @@ Instruction *InstCombiner::foldFPSignBitOps(BinaryOperator &I) {
  return nullptr;
}

Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
  if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1),
                                  I.getFastMathFlags(),
                                  SQ.getWithInstruction(&I)))
@ -637,7 +638,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
/// Fold a divide or remainder with a select instruction divisor when one of the
/// select operands is zero. In that case, we can use the other select operand
/// because div/rem by zero is undefined.
bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
bool InstCombinerImpl::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
  SelectInst *SI = dyn_cast<SelectInst>(I.getOperand(1));
  if (!SI)
    return false;
@ -738,7 +739,7 @@ static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
/// instructions (udiv and sdiv). It is called by the visitors to those integer
/// division instructions.
/// Common integer divide transforms
Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
  bool IsSigned = I.getOpcode() == Instruction::SDiv;
  Type *Ty = I.getType();
@ -874,7 +875,7 @@ namespace {

using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1,
                                           const BinaryOperator &I,
                                           InstCombiner &IC);
                                           InstCombinerImpl &IC);

/// Used to maintain state for visitUDivOperand().
struct UDivFoldAction {
@ -903,7 +904,8 @@ struct UDivFoldAction {

// X udiv 2^C -> X >> C
static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
                                    const BinaryOperator &I, InstCombiner &IC) {
                                    const BinaryOperator &I,
                                    InstCombinerImpl &IC) {
  Constant *C1 = getLogBase2(Op0->getType(), cast<Constant>(Op1));
  if (!C1)
    llvm_unreachable("Failed to constant fold udiv -> logbase2");
@ -916,7 +918,7 @@ static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2)
static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
                                InstCombiner &IC) {
                                InstCombinerImpl &IC) {
  Value *ShiftLeft;
  if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
    ShiftLeft = Op1;
@ -1010,7 +1012,7 @@ static Instruction *narrowUDivURem(BinaryOperator &I,
  return nullptr;
}

Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) {
  if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1),
                                  SQ.getWithInstruction(&I)))
    return replaceInstUsesWith(I, V);
@ -1104,7 +1106,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
  return nullptr;
}

Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
  if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1),
                                  SQ.getWithInstruction(&I)))
    return replaceInstUsesWith(I, V);
@ -1265,7 +1267,7 @@ static Instruction *foldFDivConstantDividend(BinaryOperator &I) {
  return BinaryOperator::CreateFDivFMF(NewC, X, &I);
}

Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
  if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1),
                                  I.getFastMathFlags(),
                                  SQ.getWithInstruction(&I)))
@ -1372,7 +1374,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
/// instructions (urem and srem). It is called by the visitors to those integer
/// remainder instructions.
/// Common integer remainder transforms
Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) {
  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);

  // The RHS is known non-zero.
@ -1410,7 +1412,7 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
  return nullptr;
}

Instruction *InstCombiner::visitURem(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitURem(BinaryOperator &I) {
  if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1),
                                  SQ.getWithInstruction(&I)))
    return replaceInstUsesWith(I, V);
@ -1461,7 +1463,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
  return nullptr;
}

Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitSRem(BinaryOperator &I) {
  if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1),
                                  SQ.getWithInstruction(&I)))
    return replaceInstUsesWith(I, V);
@ -1484,7 +1486,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
  // -X srem Y --> -(X srem Y)
  Value *X, *Y;
  if (match(&I, m_SRem(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
    return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y));
    return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y));

  // If the sign bits of both operands are zero (i.e. we can prove they are
  // unsigned inputs), turn this into a urem.
@ -1533,7 +1535,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
  return nullptr;
}

Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) {
  if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1),
                                  I.getFastMathFlags(),
                                  SQ.getWithInstruction(&I)))

@ -42,6 +42,7 @@
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <functional>
#include <tuple>
#include <type_traits>
@ -430,7 +431,7 @@ LLVM_NODISCARD Optional<Negator::Result> Negator::run(Value *Root) {
}

LLVM_NODISCARD Value *Negator::Negate(bool LHSIsZero, Value *Root,
                                      InstCombiner &IC) {
                                      InstCombinerImpl &IC) {
  ++NegatorTotalNegationsAttempted;
  LLVM_DEBUG(dbgs() << "Negator: attempting to sink negation into " << *Root
                    << "\n");

@ -17,6 +17,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@ -30,7 +31,7 @@ MaxNumPhis("instcombine-max-num-phis", cl::init(512),
/// The PHI arguments will be folded into a single operation with a PHI node
/// as input. The debug location of the single operation will be the merged
/// locations of the original PHI node arguments.
void InstCombiner::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) {
void InstCombinerImpl::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) {
  auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
  Inst->setDebugLoc(FirstInst->getDebugLoc());
  // We do not expect a CallInst here, otherwise, N-way merging of DebugLoc
@ -93,7 +94,7 @@ void InstCombiner::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) {
//    ptr_val_inc = ...
//    ...
//
Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) {
Instruction *InstCombinerImpl::FoldIntegerTypedPHI(PHINode &PN) {
  if (!PN.getType()->isIntegerTy())
    return nullptr;
  if (!PN.hasOneUse())
@ -292,7 +293,7 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) {

/// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the
/// adds all have a single use, turn this into a phi and a single binop.
Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
Instruction *InstCombinerImpl::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
  Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
  assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
  unsigned Opc = FirstInst->getOpcode();
@ -385,7 +386,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
  return NewBinOp;
}

Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
Instruction *InstCombinerImpl::FoldPHIArgGEPIntoPHI(PHINode &PN) {
  GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));

  SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(),
@ -494,7 +495,6 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
  return NewGEP;
}


/// Return true if we know that it is safe to sink the load out of the block
/// that defines it. This means that it must be obvious the value of the load is
/// not changed from the point of the load to the end of the block it is in.
@ -540,7 +540,7 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
  return true;
}

Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
Instruction *InstCombinerImpl::FoldPHIArgLoadIntoPHI(PHINode &PN) {
  LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));

  // FIXME: This is overconservative; this transform is allowed in some cases
@ -654,7 +654,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
/// TODO: This function could handle other cast types, but then it might
/// require special-casing a cast from the 'i1' type. See the comment in
/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types.
Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
Instruction *InstCombinerImpl::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
  // We cannot create a new instruction after the PHI if the terminator is an
  // EHPad because there is no valid insertion point.
  if (Instruction *TI = Phi.getParent()->getTerminator())
@ -728,7 +728,7 @@ Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
/// If all operands to a PHI node are the same "unary" operator and they all are
/// only used by the PHI, PHI together their inputs, and do the operation once,
/// to the result of the PHI.
Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
Instruction *InstCombinerImpl::FoldPHIArgOpIntoPHI(PHINode &PN) {
  // We cannot create a new instruction after the PHI if the terminator is an
  // EHPad because there is no valid insertion point.
  if (Instruction *TI = PN.getParent()->getTerminator())
@ -955,7 +955,7 @@ namespace llvm {
/// TODO: The user of the trunc may be an bitcast to float/double/vector or an
/// inttoptr. We should produce new PHIs in the right type.
///
Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
Instruction *InstCombinerImpl::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
  // PHIUsers - Keep track of all of the truncated values extracted from a set
  // of PHIs, along with their offset. These are the things we want to rewrite.
  SmallVector<PHIUsageRecord, 16> PHIUsers;
@ -1203,7 +1203,7 @@ static Value *SimplifyUsingControlFlow(InstCombiner &Self, PHINode &PN,

// PHINode simplification
//
Instruction *InstCombiner::visitPHINode(PHINode &PN) {
Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
  if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN)))
    return replaceInstUsesWith(PN, V);

@ -38,6 +38,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <cassert>
#include <utility>

@ -57,7 +58,7 @@ static Value *createMinMax(InstCombiner::BuilderTy &Builder,
/// constant of a binop.
static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
                                            const TargetLibraryInfo &TLI,
                                            InstCombiner &IC) {
                                            InstCombinerImpl &IC) {
  // The select condition must be an equality compare with a constant operand.
  Value *X;
  Constant *C;
@ -279,8 +280,8 @@ static APInt getSelectFoldableConstant(BinaryOperator *I) {
}

/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
                                          Instruction *FI) {
Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI,
                                              Instruction *FI) {
  // Don't break up min/max patterns. The hasOneUse checks below prevent that
  // for most cases, but vector min/max with bitcasts can be transformed. If the
  // one-use restrictions are eased for other patterns, we still don't want to
@ -418,8 +419,8 @@ static bool isSelect01(const APInt &C1I, const APInt &C2I) {

/// Try to fold the select into one of the operands to allow further
/// optimization.
Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
                                            Value *FalseVal) {
Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
                                                Value *FalseVal) {
  // See the comment above GetSelectFoldableOperands for a description of the
  // transformation we are doing here.
  if (auto *TVI = dyn_cast<BinaryOperator>(TrueVal)) {
@ -1024,9 +1025,9 @@ static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2
/// Note: if C1 != C2, this will change the icmp constant to the existing
/// constant operand of the select.
static Instruction *
canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
                               InstCombiner &IC) {
static Instruction *canonicalizeMinMaxWithConstant(SelectInst &Sel,
                                                   ICmpInst &Cmp,
                                                   InstCombinerImpl &IC) {
  if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
    return nullptr;

@ -1070,7 +1071,7 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
/// Canonicalize all these variants to 1 pattern.
/// This makes CSE more likely.
static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
                                        InstCombiner &IC) {
                                        InstCombinerImpl &IC) {
  if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
    return nullptr;

@ -1253,7 +1254,7 @@ static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
                        APInt::getAllOnesValue(
                            C0->getType()->getScalarSizeInBits()))))
      return nullptr; // Can't do, have all-ones element[s].
    C0 = AddOne(C0);
    C0 = InstCombiner::AddOne(C0);
    std::swap(X, Sel1);
    break;
  case ICmpInst::Predicate::ICMP_UGE:
@ -1313,7 +1314,7 @@ static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
                        APInt::getSignedMaxValue(
                            C2->getType()->getScalarSizeInBits()))))
      return nullptr; // Can't do, have signed max element[s].
    C2 = AddOne(C2);
    C2 = InstCombiner::AddOne(C2);
    LLVM_FALLTHROUGH;
  case ICmpInst::Predicate::ICMP_SGE:
    // Also non-canonical, but here we don't need to change C2,
@ -1360,7 +1361,7 @@ static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
// and swap the hands of select.
static Instruction *
tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
                                         InstCombiner &IC) {
                                         InstCombinerImpl &IC) {
  ICmpInst::Predicate Pred;
  Value *X;
  Constant *C0;
@ -1375,7 +1376,7 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,

  // If comparison predicate is non-canonical, then we certainly won't be able
  // to make it canonical; canonicalizeCmpWithConstant() already tried.
  if (!isCanonicalPredicate(Pred))
  if (!InstCombiner::isCanonicalPredicate(Pred))
    return nullptr;

  // If the [input] type of comparison and select type are different, lets abort
@ -1403,7 +1404,8 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
    return nullptr;

  // Check the constant we'd have with flipped-strictness predicate.
  auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0);
  auto FlippedStrictness =
      InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0);
  if (!FlippedStrictness)
    return nullptr;

@ -1426,8 +1428,8 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
}

/// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
                                                  ICmpInst *ICI) {
Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
                                                      ICmpInst *ICI) {
  if (Value *V = foldSelectValueEquivalence(SI, *ICI, SQ))
    return replaceInstUsesWith(SI, V);

@ -1579,11 +1581,11 @@ static bool canSelectOperandBeMappingIntoPredBlock(const Value *V,

/// We have an SPF (e.g. a min or max) of an SPF of the form:
///   SPF2(SPF1(A, B), C)
Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
                                        SelectPatternFlavor SPF1,
                                        Value *A, Value *B,
                                        Instruction &Outer,
                                        SelectPatternFlavor SPF2, Value *C) {
Instruction *InstCombinerImpl::foldSPFofSPF(Instruction *Inner,
                                            SelectPatternFlavor SPF1, Value *A,
                                            Value *B, Instruction &Outer,
                                            SelectPatternFlavor SPF2,
                                            Value *C) {
  if (Outer.getType() != Inner->getType())
    return nullptr;

@ -1900,7 +1902,7 @@ foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) {
  return CallInst::Create(F, {X, Y});
}

Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
Instruction *InstCombinerImpl::foldSelectExtConst(SelectInst &Sel) {
  Constant *C;
  if (!match(Sel.getTrueValue(), m_Constant(C)) &&
      !match(Sel.getFalseValue(), m_Constant(C)))
@ -2001,8 +2003,8 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
/// to a vector select by splatting the condition. A splat may get folded with
/// other operations in IR and having all operands of a select be vector types
/// is likely better for vector codegen.
static Instruction *canonicalizeScalarSelectOfVecs(
    SelectInst &Sel, InstCombiner &IC) {
static Instruction *canonicalizeScalarSelectOfVecs(SelectInst &Sel,
                                                   InstCombinerImpl &IC) {
  auto *Ty = dyn_cast<VectorType>(Sel.getType());
  if (!Ty)
    return nullptr;
@ -2172,7 +2174,7 @@ static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X,
}

/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value.
Instruction *InstCombiner::matchSAddSubSat(SelectInst &MinMax1) {
Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) {
  Type *Ty = MinMax1.getType();

  // We are looking for a tree of:
@ -2372,7 +2374,8 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel,
  bool IsTrueIfSignSet;
  ICmpInst::Predicate Pred;
  if (!match(Cond, m_OneUse(m_ICmp(Pred, m_BitCast(m_Value(X)), m_APInt(C)))) ||
      !isSignBitCheck(Pred, *C, IsTrueIfSignSet) || X->getType() != SelType)
      !InstCombiner::isSignBitCheck(Pred, *C, IsTrueIfSignSet) ||
      X->getType() != SelType)
    return nullptr;

  // If needed, negate the value that will be the sign argument of the copysign:
@ -2393,7 +2396,7 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel,
  return CopySign;
}

Instruction *InstCombiner::foldVectorSelect(SelectInst &Sel) {
Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) {
  auto *VecTy = dyn_cast<FixedVectorType>(Sel.getType());
  if (!VecTy)
    return nullptr;
@ -2523,7 +2526,7 @@ static Instruction *foldSelectToPhi(SelectInst &Sel, const DominatorTree &DT,
  return nullptr;
}

Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
  Value *CondVal = SI.getCondition();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

@ -15,6 +15,7 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;
using namespace PatternMatch;

@ -31,7 +32,7 @@ using namespace PatternMatch;
//
// AnalyzeForSignBitExtraction indicates that we will only analyze whether this
// pattern has any 2 right-shifts that sum to 1 less than original bit width.
Value *InstCombiner::reassociateShiftAmtsOfTwoSameDirectionShifts(
Value *InstCombinerImpl::reassociateShiftAmtsOfTwoSameDirectionShifts(
    BinaryOperator *Sh0, const SimplifyQuery &SQ,
    bool AnalyzeForSignBitExtraction) {
  // Look for a shift of some instruction, ignore zext of shift amount if any.
@ -360,7 +361,7 @@ static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
  return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2);
}

Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) {
  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
  assert(Op0->getType() == Op1->getType());

@ -420,8 +421,8 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
/// Return true if we can simplify two logical (either left or right) shifts
/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2.
static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
                                    Instruction *InnerShift, InstCombiner &IC,
                                    Instruction *CxtI) {
                                    Instruction *InnerShift,
                                    InstCombinerImpl &IC, Instruction *CxtI) {
  assert(InnerShift->isLogicalShift() && "Unexpected instruction type");

  // We need constant scalar or constant splat shifts.
@ -472,7 +473,7 @@ static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
/// where the client will ask if E can be computed shifted right by 64-bits. If
/// this succeeds, getShiftedValue() will be called to produce the value.
static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
                               InstCombiner &IC, Instruction *CxtI) {
                               InstCombinerImpl &IC, Instruction *CxtI) {
  // We can always evaluate constants shifted.
  if (isa<Constant>(V))
    return true;
@ -608,7 +609,7 @@ static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt,
/// When canEvaluateShifted() returns true for an expression, this function
/// inserts the new computation that produces the shifted value.
static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
                              InstCombiner &IC, const DataLayout &DL) {
                              InstCombinerImpl &IC, const DataLayout &DL) {
  // We can always evaluate constants shifted.
  if (Constant *C = dyn_cast<Constant>(V)) {
    if (isLeftShift)
@ -618,7 +619,7 @@ static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
  }

  Instruction *I = cast<Instruction>(V);
  IC.Worklist.push(I);
  IC.addToWorklist(I);

  switch (I->getOpcode()) {
  default: llvm_unreachable("Inconsistency with CanEvaluateShifted");
@ -672,8 +673,8 @@ static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift,
  }
}

Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
                                               BinaryOperator &I) {
Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1,
                                                   BinaryOperator &I) {
  bool isLeftShift = I.getOpcode() == Instruction::Shl;

  const APInt *Op1C;
@ -915,7 +916,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
  return nullptr;
}

Instruction *InstCombiner::visitShl(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
  const SimplifyQuery Q = SQ.getWithInstruction(&I);

  if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
@ -1037,7 +1038,7 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
  return nullptr;
}

Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
  if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
                                  SQ.getWithInstruction(&I)))
    return replaceInstUsesWith(I, V);
@ -1167,7 +1168,7 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
}

Instruction *
InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract(
InstCombinerImpl::foldVariableSignZeroExtensionOfVariableHighBitExtract(
    BinaryOperator &OldAShr) {
  assert(OldAShr.getOpcode() == Instruction::AShr &&
         "Must be called with arithmetic right-shift instruction only.");
@ -1235,7 +1236,7 @@ InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract(
  return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType());
}

Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
  if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
                                  SQ.getWithInstruction(&I)))
    return replaceInstUsesWith(I, V);

@ -12,29 +12,18 @@
//===----------------------------------------------------------------------===//

#include "InstCombineInternal.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "instcombine"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

/// Check to see if the specified operand of the specified instruction is a
/// constant integer. If so, check to see if there are any bits set in the
/// constant that are not demanded. If so, shrink the constant and return true.
@ -63,7 +52,7 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,

/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if
/// the instruction has any properties that allow us to simplify its operands.
bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
bool InstCombinerImpl::SimplifyDemandedInstructionBits(Instruction &Inst) {
  unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
  KnownBits Known(BitWidth);
  APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
@ -79,22 +68,20 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
/// This form of SimplifyDemandedBits simplifies the specified instruction
/// operand if possible, updating it in place. It returns true if it made any
/// change and false otherwise.
bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
                                        const APInt &DemandedMask,
                                        KnownBits &Known,
                                        unsigned Depth) {
bool InstCombinerImpl::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
                                            const APInt &DemandedMask,
                                            KnownBits &Known, unsigned Depth) {
  Use &U = I->getOperandUse(OpNo);
  Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
                                          Depth, I);
  if (!NewVal) return false;
  if (Instruction* OpInst = dyn_cast<Instruction>(U))
    salvageDebugInfo(*OpInst);

  replaceUse(U, NewVal);
  return true;
}

/// This function attempts to replace V with a simpler value based on the
/// demanded bits. When this function is called, it is known that only the bits
/// set in DemandedMask of the result of V are ever used downstream.
@ -118,9 +105,10 @@ bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
/// operands based on the information about what bits are demanded. This returns
/// some other non-null value if it found out that V is equal to another value
/// in the context where the specified bits are demanded, but not for all users.
Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
                                             KnownBits &Known, unsigned Depth,
                                             Instruction *CxtI) {
Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
                                                 KnownBits &Known,
                                                 unsigned Depth,
                                                 Instruction *CxtI) {
  assert(V != nullptr && "Null pointer of Value???");
  assert(Depth <= 6 && "Limit Search Depth");
  uint32_t BitWidth = DemandedMask.getBitWidth();
@ -728,7 +716,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
    bool KnownBitsComputed = false;
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      switch (II->getIntrinsicID()) {
      default: break;
      case Intrinsic::bswap: {
        // If the only bits demanded come from one byte of the bswap result,
        // just shift the input byte into position to eliminate the bswap.
@ -784,39 +771,14 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
        KnownBitsComputed = true;
        break;
      }
      case Intrinsic::x86_mmx_pmovmskb:
      case Intrinsic::x86_sse_movmsk_ps:
      case Intrinsic::x86_sse2_movmsk_pd:
      case Intrinsic::x86_sse2_pmovmskb_128:
      case Intrinsic::x86_avx_movmsk_ps_256:
      case Intrinsic::x86_avx_movmsk_pd_256:
      case Intrinsic::x86_avx2_pmovmskb: {
        // MOVMSK copies the vector elements' sign bits to the low bits
        // and zeros the high bits.
        unsigned ArgWidth;
        if (II->getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
          ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
        } else {
          auto Arg = II->getArgOperand(0);
          auto ArgType = cast<VectorType>(Arg->getType());
          ArgWidth = ArgType->getNumElements();
        }

        // If we don't need any of low bits then return zero,
        // we know that DemandedMask is non-zero already.
        APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
        if (DemandedElts.isNullValue())
          return ConstantInt::getNullValue(VTy);

        // We know that the upper bits are set to zero.
        Known.Zero.setBitsFrom(ArgWidth);
        KnownBitsComputed = true;
      default: {
        // Handle target specific intrinsics
        Optional<Value *> V = targetSimplifyDemandedUseBitsIntrinsic(
            *II, DemandedMask, Known, KnownBitsComputed);
        if (V.hasValue())
          return V.getValue();
        break;
      }
      case Intrinsic::x86_sse42_crc32_64_64:
        Known.Zero.setBitsFrom(32);
        KnownBitsComputed = true;
        break;
      }
    }

@ -836,11 +798,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
|
||||
/// Helper routine of SimplifyDemandedUseBits. It computes Known
|
||||
/// bits. It also tries to handle simplifications that can be done based on
|
||||
/// DemandedMask, but without modifying the Instruction.
|
||||
Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
|
||||
const APInt &DemandedMask,
|
||||
KnownBits &Known,
|
||||
unsigned Depth,
|
||||
Instruction *CxtI) {
|
||||
Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits(
|
||||
Instruction *I, const APInt &DemandedMask, KnownBits &Known, unsigned Depth,
|
||||
Instruction *CxtI) {
|
||||
unsigned BitWidth = DemandedMask.getBitWidth();
|
||||
Type *ITy = I->getType();
|
||||
|
||||
@ -940,7 +900,6 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
|
||||
/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
|
||||
/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
|
||||
@ -958,11 +917,9 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
|
||||
///
|
||||
/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
|
||||
/// not successful.
|
||||
Value *
|
||||
InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
|
||||
Instruction *Shl, const APInt &ShlOp1,
|
||||
const APInt &DemandedMask,
|
||||
KnownBits &Known) {
|
||||
Value *InstCombinerImpl::simplifyShrShlDemandedBits(
|
||||
Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
|
||||
const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known) {
|
||||
if (!ShlOp1 || !ShrOp1)
|
||||
return nullptr; // No-op.
|
||||
|
||||
@ -1022,152 +979,6 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
|
||||
///
|
||||
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
|
||||
/// struct returns.
|
||||
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
||||
APInt DemandedElts,
|
||||
int DMaskIdx) {
|
||||
|
||||
auto *IIVTy = cast<VectorType>(II->getType());
|
||||
unsigned VWidth = IIVTy->getNumElements();
|
||||
if (VWidth == 1)
|
||||
return nullptr;
|
||||
|
||||
IRBuilderBase::InsertPointGuard Guard(Builder);
|
||||
Builder.SetInsertPoint(II);
|
||||
|
||||
// Assume the arguments are unchanged and later override them, if needed.
|
||||
SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
|
||||
|
||||
if (DMaskIdx < 0) {
|
||||
// Buffer case.
|
||||
|
||||
const unsigned ActiveBits = DemandedElts.getActiveBits();
|
||||
const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
|
||||
|
||||
// Start assuming the prefix of elements is demanded, but possibly clear
|
||||
// some other bits if there are trailing zeros (unused components at front)
|
||||
// and update offset.
|
||||
DemandedElts = (1 << ActiveBits) - 1;
|
||||
|
||||
if (UnusedComponentsAtFront > 0) {
|
||||
static const unsigned InvalidOffsetIdx = 0xf;
|
||||
|
||||
unsigned OffsetIdx;
|
||||
switch (II->getIntrinsicID()) {
|
||||
case Intrinsic::amdgcn_raw_buffer_load:
|
||||
OffsetIdx = 1;
|
||||
break;
|
||||
case Intrinsic::amdgcn_s_buffer_load:
|
||||
// If resulting type is vec3, there is no point in trimming the
|
||||
// load with updated offset, as the vec3 would most likely be widened to
|
||||
// vec4 anyway during lowering.
|
||||
if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
|
||||
OffsetIdx = InvalidOffsetIdx;
|
||||
else
|
||||
OffsetIdx = 1;
|
||||
break;
|
||||
case Intrinsic::amdgcn_struct_buffer_load:
|
||||
OffsetIdx = 2;
|
||||
break;
|
||||
default:
|
||||
// TODO: handle tbuffer* intrinsics.
|
||||
OffsetIdx = InvalidOffsetIdx;
|
||||
break;
|
||||
}
|
||||
|
||||
if (OffsetIdx != InvalidOffsetIdx) {
|
||||
// Clear demanded bits and update the offset.
|
||||
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
|
||||
auto *Offset = II->getArgOperand(OffsetIdx);
|
||||
unsigned SingleComponentSizeInBits =
|
||||
getDataLayout().getTypeSizeInBits(II->getType()->getScalarType());
|
||||
unsigned OffsetAdd =
|
||||
UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
|
||||
auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
|
||||
Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Image case.
|
||||
|
||||
ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx));
|
||||
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
|
||||
|
||||
// Mask off values that are undefined because the dmask doesn't cover them
|
||||
DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
|
||||
|
||||
unsigned NewDMaskVal = 0;
|
||||
unsigned OrigLoadIdx = 0;
|
||||
for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
|
||||
const unsigned Bit = 1 << SrcIdx;
|
||||
if (!!(DMaskVal & Bit)) {
|
||||
if (!!DemandedElts[OrigLoadIdx])
|
||||
NewDMaskVal |= Bit;
|
||||
OrigLoadIdx++;
|
||||
}
|
||||
}
|
||||
|
||||
if (DMaskVal != NewDMaskVal)
|
||||
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
|
||||
}
|
||||
|
||||
unsigned NewNumElts = DemandedElts.countPopulation();
|
||||
if (!NewNumElts)
|
||||
return UndefValue::get(II->getType());
|
||||
|
||||
// FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
|
||||
// fully supported.
|
||||
if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
|
||||
return nullptr;
|
||||
|
||||
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
|
||||
if (DMaskIdx >= 0)
|
||||
II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Validate function argument and return types, extracting overloaded types
|
||||
// along the way.
|
||||
SmallVector<Type *, 6> OverloadTys;
|
||||
if (!Intrinsic::getIntrinsicSignature(II->getCalledFunction(), OverloadTys))
|
||||
return nullptr;
|
||||
|
||||
Module *M = II->getParent()->getParent()->getParent();
|
||||
Type *EltTy = IIVTy->getElementType();
|
||||
Type *NewTy =
|
||||
(NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
|
||||
|
||||
OverloadTys[0] = NewTy;
|
||||
Function *NewIntrin =
|
||||
Intrinsic::getDeclaration(M, II->getIntrinsicID(), OverloadTys);
|
||||
|
||||
CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
|
||||
NewCall->takeName(II);
|
||||
NewCall->copyMetadata(*II);
|
||||
|
||||
if (NewNumElts == 1) {
|
||||
return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
|
||||
DemandedElts.countTrailingZeros());
|
||||
}
|
||||
|
||||
SmallVector<int, 8> EltMask;
|
||||
unsigned NewLoadIdx = 0;
|
||||
for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
|
||||
if (!!DemandedElts[OrigLoadIdx])
|
||||
EltMask.push_back(NewLoadIdx++);
|
||||
else
|
||||
EltMask.push_back(NewNumElts);
|
||||
}
|
||||
|
||||
Value *Shuffle =
|
||||
Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
|
||||
|
||||
return Shuffle;
|
||||
}
|
||||
|
||||
/// The specified value produces a vector with any number of elements.
|
||||
/// This method analyzes which elements of the operand are undef and returns
|
||||
/// that information in UndefElts.
|
||||
@ -1181,10 +992,11 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
||||
/// If the information about demanded elements can be used to simplify the
|
||||
/// operation, the operation is simplified, then the resultant value is
|
||||
/// returned. This returns null if no change was made.
|
||||
Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
||||
APInt &UndefElts,
|
||||
unsigned Depth,
|
||||
bool AllowMultipleUsers) {
|
||||
Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
|
||||
APInt DemandedElts,
|
||||
APInt &UndefElts,
|
||||
unsigned Depth,
|
||||
bool AllowMultipleUsers) {
|
||||
// Cannot analyze scalable type. The number of vector elements is not a
|
||||
// compile-time constant.
|
||||
if (isa<ScalableVectorType>(V->getType()))
|
||||
@ -1291,12 +1103,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
||||
};
|
||||
if (mayIndexStructType(cast<GetElementPtrInst>(*I)))
|
||||
break;
|
||||
|
||||
|
||||
// Conservatively track the demanded elements back through any vector
|
||||
// operands we may have. We know there must be at least one, or we
|
||||
// wouldn't have a vector result to get here. Note that we intentionally
|
||||
// merge the undef bits here since gepping with either an undef base or
|
||||
// index results in undef.
|
||||
// index results in undef.
|
||||
for (unsigned i = 0; i < I->getNumOperands(); i++) {
|
||||
if (isa<UndefValue>(I->getOperand(i))) {
|
||||
// If the entire vector is undefined, just return this info.
|
||||
@ -1620,227 +1432,19 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
||||
if (II->getIntrinsicID() == Intrinsic::masked_gather)
|
||||
simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2);
|
||||
simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
|
||||
|
||||
|
||||
// Output elements are undefined if the element from both sources are.
|
||||
// TODO: can strengthen via mask as well.
|
||||
UndefElts = UndefElts2 & UndefElts3;
|
||||
break;
|
||||
}
|
||||
case Intrinsic::x86_xop_vfrcz_ss:
|
||||
case Intrinsic::x86_xop_vfrcz_sd:
|
||||
// The instructions for these intrinsics are speced to zero upper bits not
|
||||
// pass them through like other scalar intrinsics. So we shouldn't just
|
||||
// use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
|
||||
// Instead we should return a zero vector.
|
||||
if (!DemandedElts[0]) {
|
||||
Worklist.push(II);
|
||||
return ConstantAggregateZero::get(II->getType());
|
||||
}
|
||||
|
||||
// Only the lower element is used.
|
||||
DemandedElts = 1;
|
||||
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
|
||||
|
||||
// Only the lower element is undefined. The high elements are zero.
|
||||
UndefElts = UndefElts[0];
|
||||
break;
|
||||
|
||||
// Unary scalar-as-vector operations that work column-wise.
|
||||
case Intrinsic::x86_sse_rcp_ss:
|
||||
case Intrinsic::x86_sse_rsqrt_ss:
|
||||
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
|
||||
|
||||
// If lowest element of a scalar op isn't used then use Arg0.
|
||||
if (!DemandedElts[0]) {
|
||||
Worklist.push(II);
|
||||
return II->getArgOperand(0);
|
||||
}
|
||||
// TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
|
||||
// checks).
|
||||
break;
|
||||
|
||||
// Binary scalar-as-vector operations that work column-wise. The high
|
||||
// elements come from operand 0. The low element is a function of both
|
||||
// operands.
|
||||
case Intrinsic::x86_sse_min_ss:
|
||||
case Intrinsic::x86_sse_max_ss:
|
||||
case Intrinsic::x86_sse_cmp_ss:
|
||||
case Intrinsic::x86_sse2_min_sd:
|
||||
case Intrinsic::x86_sse2_max_sd:
|
||||
case Intrinsic::x86_sse2_cmp_sd: {
|
||||
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
|
||||
|
||||
// If lowest element of a scalar op isn't used then use Arg0.
|
||||
if (!DemandedElts[0]) {
|
||||
Worklist.push(II);
|
||||
return II->getArgOperand(0);
|
||||
}
|
||||
|
||||
// Only lower element is used for operand 1.
|
||||
DemandedElts = 1;
|
||||
simplifyAndSetOp(II, 1, DemandedElts, UndefElts2);
|
||||
|
||||
// Lower element is undefined if both lower elements are undefined.
|
||||
// Consider things like undef&0. The result is known zero, not undef.
|
||||
if (!UndefElts2[0])
|
||||
UndefElts.clearBit(0);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
// Binary scalar-as-vector operations that work column-wise. The high
|
||||
// elements come from operand 0 and the low element comes from operand 1.
|
||||
case Intrinsic::x86_sse41_round_ss:
|
||||
case Intrinsic::x86_sse41_round_sd: {
|
||||
// Don't use the low element of operand 0.
|
||||
APInt DemandedElts2 = DemandedElts;
|
||||
DemandedElts2.clearBit(0);
|
||||
simplifyAndSetOp(II, 0, DemandedElts2, UndefElts);
|
||||
|
||||
// If lowest element of a scalar op isn't used then use Arg0.
|
||||
if (!DemandedElts[0]) {
|
||||
Worklist.push(II);
|
||||
return II->getArgOperand(0);
|
||||
}
|
||||
|
||||
// Only lower element is used for operand 1.
|
||||
DemandedElts = 1;
|
||||
simplifyAndSetOp(II, 1, DemandedElts, UndefElts2);
|
||||
|
||||
// Take the high undef elements from operand 0 and take the lower element
|
||||
// from operand 1.
|
||||
UndefElts.clearBit(0);
|
||||
UndefElts |= UndefElts2[0];
|
||||
break;
|
||||
}
|
||||
|
||||
// Three input scalar-as-vector operations that work column-wise. The high
|
||||
// elements come from operand 0 and the low element is a function of all
|
||||
// three inputs.
|
||||
case Intrinsic::x86_avx512_mask_add_ss_round:
|
||||
case Intrinsic::x86_avx512_mask_div_ss_round:
|
||||
case Intrinsic::x86_avx512_mask_mul_ss_round:
|
||||
case Intrinsic::x86_avx512_mask_sub_ss_round:
|
||||
case Intrinsic::x86_avx512_mask_max_ss_round:
|
||||
case Intrinsic::x86_avx512_mask_min_ss_round:
|
||||
case Intrinsic::x86_avx512_mask_add_sd_round:
|
||||
case Intrinsic::x86_avx512_mask_div_sd_round:
|
||||
case Intrinsic::x86_avx512_mask_mul_sd_round:
|
||||
case Intrinsic::x86_avx512_mask_sub_sd_round:
|
||||
case Intrinsic::x86_avx512_mask_max_sd_round:
|
||||
case Intrinsic::x86_avx512_mask_min_sd_round:
|
||||
simplifyAndSetOp(II, 0, DemandedElts, UndefElts);
|
||||
|
||||
// If lowest element of a scalar op isn't used then use Arg0.
|
||||
if (!DemandedElts[0]) {
|
||||
Worklist.push(II);
|
||||
return II->getArgOperand(0);
|
||||
}
|
||||
|
||||
// Only lower element is used for operand 1 and 2.
|
||||
DemandedElts = 1;
|
||||
simplifyAndSetOp(II, 1, DemandedElts, UndefElts2);
|
||||
simplifyAndSetOp(II, 2, DemandedElts, UndefElts3);
|
||||
|
||||
// Lower element is undefined if all three lower elements are undefined.
|
||||
// Consider things like undef&0. The result is known zero, not undef.
|
||||
if (!UndefElts2[0] || !UndefElts3[0])
|
||||
UndefElts.clearBit(0);
|
||||
|
||||
break;
|
||||
|
||||
case Intrinsic::x86_sse2_packssdw_128:
|
||||
case Intrinsic::x86_sse2_packsswb_128:
|
||||
case Intrinsic::x86_sse2_packuswb_128:
|
||||
case Intrinsic::x86_sse41_packusdw:
|
||||
case Intrinsic::x86_avx2_packssdw:
|
||||
case Intrinsic::x86_avx2_packsswb:
|
||||
case Intrinsic::x86_avx2_packusdw:
|
||||
case Intrinsic::x86_avx2_packuswb:
|
||||
case Intrinsic::x86_avx512_packssdw_512:
|
||||
case Intrinsic::x86_avx512_packsswb_512:
|
||||
case Intrinsic::x86_avx512_packusdw_512:
|
||||
case Intrinsic::x86_avx512_packuswb_512: {
|
||||
auto *Ty0 = II->getArgOperand(0)->getType();
|
||||
unsigned InnerVWidth = cast<VectorType>(Ty0)->getNumElements();
|
||||
assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
|
||||
|
||||
unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
|
||||
unsigned VWidthPerLane = VWidth / NumLanes;
|
||||
unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
|
||||
|
||||
// Per lane, pack the elements of the first input and then the second.
|
||||
// e.g.
|
||||
// v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
|
||||
// v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
|
||||
for (int OpNum = 0; OpNum != 2; ++OpNum) {
|
||||
APInt OpDemandedElts(InnerVWidth, 0);
|
||||
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
|
||||
unsigned LaneIdx = Lane * VWidthPerLane;
|
||||
for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
|
||||
unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
|
||||
if (DemandedElts[Idx])
|
||||
OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
|
||||
}
|
||||
}
|
||||
|
||||
// Demand elements from the operand.
|
||||
APInt OpUndefElts(InnerVWidth, 0);
|
||||
simplifyAndSetOp(II, OpNum, OpDemandedElts, OpUndefElts);
|
||||
|
||||
// Pack the operand's UNDEF elements, one lane at a time.
|
||||
OpUndefElts = OpUndefElts.zext(VWidth);
|
||||
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
|
||||
APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
|
||||
LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
|
||||
LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
|
||||
UndefElts |= LaneElts;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// PSHUFB
|
||||
case Intrinsic::x86_ssse3_pshuf_b_128:
|
||||
case Intrinsic::x86_avx2_pshuf_b:
|
||||
case Intrinsic::x86_avx512_pshuf_b_512:
|
||||
// PERMILVAR
|
||||
case Intrinsic::x86_avx_vpermilvar_ps:
|
||||
case Intrinsic::x86_avx_vpermilvar_ps_256:
|
||||
case Intrinsic::x86_avx512_vpermilvar_ps_512:
|
||||
case Intrinsic::x86_avx_vpermilvar_pd:
|
||||
case Intrinsic::x86_avx_vpermilvar_pd_256:
|
||||
case Intrinsic::x86_avx512_vpermilvar_pd_512:
|
||||
// PERMV
|
||||
case Intrinsic::x86_avx2_permd:
|
||||
case Intrinsic::x86_avx2_permps: {
|
||||
simplifyAndSetOp(II, 1, DemandedElts, UndefElts);
|
||||
break;
|
||||
}
|
||||
|
||||
// SSE4A instructions leave the upper 64-bits of the 128-bit result
|
||||
// in an undefined state.
|
||||
case Intrinsic::x86_sse4a_extrq:
|
||||
case Intrinsic::x86_sse4a_extrqi:
|
||||
case Intrinsic::x86_sse4a_insertq:
|
||||
case Intrinsic::x86_sse4a_insertqi:
|
||||
UndefElts.setHighBits(VWidth / 2);
|
||||
break;
|
||||
case Intrinsic::amdgcn_buffer_load:
|
||||
case Intrinsic::amdgcn_buffer_load_format:
|
||||
case Intrinsic::amdgcn_raw_buffer_load:
|
||||
case Intrinsic::amdgcn_raw_buffer_load_format:
|
||||
case Intrinsic::amdgcn_raw_tbuffer_load:
|
||||
case Intrinsic::amdgcn_s_buffer_load:
|
||||
case Intrinsic::amdgcn_struct_buffer_load:
|
||||
case Intrinsic::amdgcn_struct_buffer_load_format:
|
||||
case Intrinsic::amdgcn_struct_tbuffer_load:
|
||||
case Intrinsic::amdgcn_tbuffer_load:
|
||||
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
|
||||
default: {
|
||||
if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
|
||||
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
|
||||
|
||||
// Handle target specific intrinsics
|
||||
Optional<Value *> V = targetSimplifyDemandedVectorEltsIntrinsic(
|
||||
*II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
|
||||
simplifyAndSetOp);
|
||||
if (V.hasValue())
|
||||
return V.getValue();
|
||||
break;
|
||||
}
|
||||
} // switch on IntrinsicID
|
||||
|
@ -35,6 +35,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <cassert>
#include <cstdint>
#include <iterator>
@ -85,7 +86,8 @@ static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
// If we have a PHI node with a vector type that is only used to feed
// itself and be an operand of extractelement at a constant location,
// try to replace the PHI of the vector type with a PHI of a scalar type.
Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI,
PHINode *PN) {
SmallVector<Instruction *, 2> Extracts;
// The users we want the PHI to have are:
// 1) The EI ExtractElement (we already know this)
@ -321,7 +323,7 @@ static APInt findDemandedEltsByAllUsers(Value *V) {
return UnionUsedElts;
}

Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
Value *SrcVec = EI.getVectorOperand();
Value *Index = EI.getIndexOperand();
if (Value *V = SimplifyExtractElementInst(SrcVec, Index,
@ -531,7 +533,7 @@ static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
/// shufflevector to replace one or more insert/extract pairs.
static void replaceExtractElements(InsertElementInst *InsElt,
ExtractElementInst *ExtElt,
InstCombiner &IC) {
InstCombinerImpl &IC) {
VectorType *InsVecType = InsElt->getType();
VectorType *ExtVecType = ExtElt->getVectorOperandType();
unsigned NumInsElts = InsVecType->getNumElements();
@ -614,7 +616,7 @@ using ShuffleOps = std::pair<Value *, Value *>;

static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,
Value *PermittedRHS,
InstCombiner &IC) {
InstCombinerImpl &IC) {
assert(V->getType()->isVectorTy() && "Invalid shuffle!");
unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();

@ -699,7 +701,7 @@ static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,
/// first one, making the first one redundant.
/// It should be transformed to:
/// %0 = insertvalue { i8, i32 } undef, i8 %y, 0
Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) {
Instruction *InstCombinerImpl::visitInsertValueInst(InsertValueInst &I) {
bool IsRedundant = false;
ArrayRef<unsigned int> FirstIndices = I.getIndices();

@ -1041,7 +1043,7 @@ static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
return nullptr;
}

Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
Value *VecOp = IE.getOperand(0);
Value *ScalarOp = IE.getOperand(1);
Value *IdxOp = IE.getOperand(2);
@ -1525,7 +1527,7 @@ static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
is_contained(Mask, UndefMaskElem) &&
(Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
if (MightCreatePoisonOrUB)
NewC = getSafeVectorConstantForBinop(BOpcode, NewC, true);
NewC = InstCombiner::getSafeVectorConstantForBinop(BOpcode, NewC, true);

// shuf (bop X, C), X, M --> bop X, C'
// shuf X, (bop X, C), M --> bop X, C'
@ -1652,7 +1654,8 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
is_contained(Mask, UndefMaskElem) &&
(Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
if (MightCreatePoisonOrUB)
NewC = getSafeVectorConstantForBinop(BOpc, NewC, ConstantsAreOp1);
NewC = InstCombiner::getSafeVectorConstantForBinop(BOpc, NewC,
ConstantsAreOp1);

Value *V;
if (X == Y) {
@ -1823,7 +1826,7 @@ static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
/// Try to replace a shuffle with an insertelement or try to replace a shuffle
/// operand with the operand of an insertelement.
static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
InstCombiner &IC) {
InstCombinerImpl &IC) {
Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
SmallVector<int, 16> Mask;
Shuf.getShuffleMask(Mask);
@ -1974,7 +1977,7 @@ static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
return new ShuffleVectorInst(X, Y, NewMask);
}

Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Value *LHS = SVI.getOperand(0);
Value *RHS = SVI.getOperand(1);
SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);

@ -59,6 +59,7 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
@ -160,7 +161,41 @@ MaxArraySize("instcombine-maxarray-size", cl::init(1024),
static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare",
cl::Hidden, cl::init(true));

Value *InstCombiner::EmitGEPOffset(User *GEP) {
Optional<Instruction *>
InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) {
// Handle target specific intrinsics
if (II.getCalledFunction()->isTargetIntrinsic()) {
return TTI.instCombineIntrinsic(*this, II);
}
return None;
}

Optional<Value *> InstCombiner::targetSimplifyDemandedUseBitsIntrinsic(
IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
bool &KnownBitsComputed) {
// Handle target specific intrinsics
if (II.getCalledFunction()->isTargetIntrinsic()) {
return TTI.simplifyDemandedUseBitsIntrinsic(*this, II, DemandedMask, Known,
KnownBitsComputed);
}
return None;
}

Optional<Value *> InstCombiner::targetSimplifyDemandedVectorEltsIntrinsic(
IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2,
APInt &UndefElts3,
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) {
// Handle target specific intrinsics
if (II.getCalledFunction()->isTargetIntrinsic()) {
return TTI.simplifyDemandedVectorEltsIntrinsic(
*this, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
SimplifyAndSetOp);
}
return None;
}

Value *InstCombinerImpl::EmitGEPOffset(User *GEP) {
return llvm::EmitGEPOffset(&Builder, DL, GEP);
}

@ -173,8 +208,8 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) {
/// legal to convert to, in order to open up more combining opportunities.
/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
/// from frontend languages.
bool InstCombiner::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
bool InstCombinerImpl::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);

@ -201,7 +236,7 @@ bool InstCombiner::shouldChangeType(unsigned FromWidth,
/// to a larger illegal type. i1 is always treated as a legal type because it is
/// a fundamental type in IR, and there are many specialized optimizations for
/// i1 types.
bool InstCombiner::shouldChangeType(Type *From, Type *To) const {
bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const {
// TODO: This could be extended to allow vectors. Datalayout changes might be
// needed to properly support that.
if (!From->isIntegerTy() || !To->isIntegerTy())
@ -269,7 +304,8 @@ static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
/// cast to eliminate one of the associative operations:
/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1, InstCombiner &IC) {
static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1,
InstCombinerImpl &IC) {
auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
if (!Cast || !Cast->hasOneUse())
return false;
@ -327,7 +363,7 @@ static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1, InstCombiner &IC) {
/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
/// if C1 and C2 are constants.
bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
Instruction::BinaryOps Opcode = I.getOpcode();
bool Changed = false;

@ -555,9 +591,10 @@ getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op,

/// This tries to simplify binary operations by factorizing out common terms
/// (e. g. "(A*B)+(A*C)" -> "A*(B+C)").
Value *InstCombiner::tryFactorization(BinaryOperator &I,
Instruction::BinaryOps InnerOpcode,
Value *A, Value *B, Value *C, Value *D) {
Value *InstCombinerImpl::tryFactorization(BinaryOperator &I,
Instruction::BinaryOps InnerOpcode,
Value *A, Value *B, Value *C,
Value *D) {
assert(A && B && C && D && "All values must be provided");

Value *V = nullptr;
@ -660,7 +697,7 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I,
/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in
/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win).
/// Returns the simplified value, or null if it didn't simplify.
Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
Value *InstCombinerImpl::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
@ -774,8 +811,9 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
}

Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
Value *LHS, Value *RHS) {
Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
Value *LHS,
Value *RHS) {
Value *A, *B, *C, *D, *E, *F;
bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C)));
bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F)));
@ -827,7 +865,7 @@ Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,

/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
/// constant zero (which is the 'negate' form).
Value *InstCombiner::dyn_castNegVal(Value *V) const {
Value *InstCombinerImpl::dyn_castNegVal(Value *V) const {
Value *NegV;
if (match(V, m_Neg(m_Value(NegV))))
return NegV;
@ -888,7 +926,8 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
return RI;
}

Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op,
SelectInst *SI) {
// Don't modify shared select instructions.
if (!SI->hasOneUse())
return nullptr;
@ -983,7 +1022,7 @@ static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
return RI;
}

Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
unsigned NumPHIValues = PN->getNumIncomingValues();
if (NumPHIValues == 0)
return nullptr;
@ -1125,7 +1164,7 @@ Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
return replaceInstUsesWith(I, NewPN);
}

Instruction *InstCombiner::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
if (!isa<Constant>(I.getOperand(1)))
return nullptr;

@ -1143,8 +1182,9 @@ Instruction *InstCombiner::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
/// is a sequence of GEP indices into the pointed type that will land us at the
/// specified offset. If so, fill them into NewIndices and return the resultant
/// element type, otherwise return null.
Type *InstCombiner::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
SmallVectorImpl<Value *> &NewIndices) {
Type *
InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
SmallVectorImpl<Value *> &NewIndices) {
Type *Ty = PtrTy->getElementType();
if (!Ty->isSized())
return nullptr;
@ -1213,7 +1253,7 @@ static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {

/// Return a value X such that Val = X * Scale, or null if none.
/// If the multiplication is known not to overflow, then NoSignedWrap is set.
Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
Value *InstCombinerImpl::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
Scale.getBitWidth() && "Scale not compatible with value!");
@ -1453,7 +1493,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
} while (true);
}

Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
// FIXME: some of this is likely fine for scalable vectors
if (!isa<FixedVectorType>(Inst.getType()))
return nullptr;
@ -1675,7 +1715,7 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
/// Try to narrow the width of a binop if at least 1 operand is an extend of
/// of a value. This requires a potentially expensive known bits check to make
/// sure the narrow op does not overflow.
Instruction *InstCombiner::narrowMathIfNoOverflow(BinaryOperator &BO) {
Instruction *InstCombinerImpl::narrowMathIfNoOverflow(BinaryOperator &BO) {
// We need at least one extended operand.
Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1);

@ -1764,7 +1804,7 @@ static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
}

Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
Type *GEPType = GEP.getType();
Type *GEPEltType = GEP.getSourceElementType();
@ -2516,7 +2556,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
return true;
}

Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
// If we have a malloc call which is only used in any amount of comparisons to
// null and free calls, delete the calls and replace the comparisons with true
// or false as appropriate.
@ -2675,7 +2715,7 @@ static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
return &FI;
}

Instruction *InstCombiner::visitFree(CallInst &FI) {
Instruction *InstCombinerImpl::visitFree(CallInst &FI) {
Value *Op = FI.getArgOperand(0);

// free undef -> unreachable.
@ -2716,7 +2756,7 @@ static bool isMustTailCall(Value *V) {
return false;
}

Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) {
Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) {
if (RI.getNumOperands() == 0) // ret void
return nullptr;

@ -2739,7 +2779,7 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) {
return nullptr;
}

Instruction *InstCombiner::visitUnconditionalBranchInst(BranchInst &BI) {
Instruction *InstCombinerImpl::visitUnconditionalBranchInst(BranchInst &BI) {
assert(BI.isUnconditional() && "Only for unconditional branches.");

// If this store is the second-to-last instruction in the basic block
@ -2768,7 +2808,7 @@ Instruction *InstCombiner::visitUnconditionalBranchInst(BranchInst &BI) {
return nullptr;
}

Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) {
if (BI.isUnconditional())
return visitUnconditionalBranchInst(BI);

@ -2804,7 +2844,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
return nullptr;
}

Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
Instruction *InstCombinerImpl::visitSwitchInst(SwitchInst &SI) {
Value *Cond = SI.getCondition();
Value *Op0;
ConstantInt *AddRHS;
@ -2835,7 +2875,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);

// Shrink the condition operand if the new type is smaller than the old type.
// But do not shrink to a non-standard type, because backend can't generate
// good code for that yet.
// TODO: We can make it aggressive again after fixing PR39569.
if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
@ -2854,7 +2894,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
return nullptr;
}

Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) {
Value *Agg = EV.getAggregateOperand();

if (!EV.hasIndices())
@ -3015,7 +3055,7 @@ static bool shorter_filter(const Value *LHS, const Value *RHS) {
cast<ArrayType>(RHS->getType())->getNumElements();
}

Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
Instruction *InstCombinerImpl::visitLandingPadInst(LandingPadInst &LI) {
// The logic here should be correct for any real-world personality function.
// However if that turns out not to be true, the offending logic can always
// be conditioned on the personality function, like the catch-all logic is.
@ -3324,7 +3364,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
return nullptr;
}

Instruction *InstCombiner::visitFreeze(FreezeInst &I) {
Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
Value *Op0 = I.getOperand(0);

if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I)))
@ -3435,7 +3475,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
return true;
}

bool InstCombiner::run() {
bool InstCombinerImpl::run() {
while (!Worklist.isEmpty()) {
// Walk deferred instructions in reverse order, and push them to the
// worklist, which means they'll end up popped from the worklist in-order.
@ -3718,8 +3758,8 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,

static bool combineInstructionsOverFunction(
Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) {
auto &DL = F.getParent()->getDataLayout();
MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue());
@ -3763,8 +3803,8 @@ static bool combineInstructionsOverFunction(

MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);

InstCombiner IC(Worklist, Builder, F.hasMinSize(), AA,
AC, TLI, DT, ORE, BFI, PSI, DL, LI);
InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT,
ORE, BFI, PSI, DL, LI);
IC.MaxArraySizeForCombine = MaxArraySize;

if (!IC.run())
@ -3787,6 +3827,7 @@ PreservedAnalyses InstCombinePass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);

auto *LI = AM.getCachedResult<LoopAnalysis>(F);

@ -3797,8 +3838,8 @@ PreservedAnalyses InstCombinePass::run(Function &F,
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;

if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI,
PSI, MaxIterations, LI))
if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE,
BFI, PSI, MaxIterations, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();

@ -3816,6 +3857,7 @@ void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
@ -3834,6 +3876,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

@ -3847,8 +3890,8 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
&getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
nullptr;

return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI,
PSI, MaxIterations, LI);
return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE,
BFI, PSI, MaxIterations, LI);
}

char InstructionCombiningPass::ID = 0;
@ -3867,6 +3910,7 @@ INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s
; RUN: opt -instcombine -mtriple=thumbv8.1m.main %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s

declare <16 x i1> @llvm.arm.mve.vctp8(i32)
declare <8 x i1> @llvm.arm.mve.vctp16(i32)

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -instcombine -S %s | FileCheck --check-prefix=IR %s
; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -O3 -o - | FileCheck --check-prefix=ASM %s
; RUN: opt -instcombine -mtriple=thumbv8.1m.main -S %s | FileCheck --check-prefix=IR %s
; RUN: opt -instcombine -mtriple=thumbv8.1m.main %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -O3 -o - | FileCheck --check-prefix=ASM %s

%struct.foo = type { [2 x <4 x i32>] }

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs -o - | FileCheck %s
; RUN: opt -instcombine -mtriple=thumbv8.1m.main-none-eabi %s | llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs -o - | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

@ -42,4 +42,3 @@ declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>)
declare <8 x i1> @llvm.arm.mve.vctp16(i32)

@ -1,4 +1,4 @@
; RUN: opt -S -instcombine %s | FileCheck %s
; RUN: opt -S -instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s

; --------------------------------------------------------------------
; llvm.amdgcn.buffer.load

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -instcombine -S < %s | FileCheck %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -instcombine -S < %s | FileCheck %s

; --------------------------------------------------------------------
; llvm.amdgcn.rcp

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -instcombine -S | FileCheck %s

define float @ldexp_f32_undef_undef() {
; CHECK-LABEL: @ldexp_f32_undef_undef(

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -instcombine -S -o - %s | FileCheck %s
; RUN: opt -instcombine -S -mtriple=arm -o - %s | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

@ -1,4 +1,4 @@
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=arm -S | FileCheck %s

; The alignment arguments for NEON load/store intrinsics can be increased
; by instcombine. Check for this.

@ -6,11 +6,11 @@
; RUN: cat %s > %t.ftz
; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "preserve-sign" }' >> %t.ftz
; RUN: opt < %t.ftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ
; RUN: opt < %t.ftz -instcombine -mtriple=nvptx64-nvidia-cuda -S | FileCheck %s --check-prefix=CHECK --check-prefix=FTZ

; RUN: cat %s > %t.noftz
; RUN: echo 'attributes #0 = { "denormal-fp-math-f32" = "ieee" }' >> %t.noftz
; RUN: opt < %t.noftz -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ
; RUN: opt < %t.noftz -instcombine -mtriple=nvptx64-nvidia-cuda -S | FileCheck %s --check-prefix=CHECK --check-prefix=NOFTZ

; We handle nvvm intrinsics with ftz variants as follows:
; - If the module is in ftz mode, the ftz variant is transformed into the

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

; The test checks the folding of cmp(sub(a,b),0) into cmp(a,b).

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

declare { i8, i32 } @llvm.x86.addcarry.32(i8, i32, i32)
declare { i8, i64 } @llvm.x86.addcarry.64(i8, i64, i64)
@ -35,4 +35,3 @@ define i64 @no_carryin_i64(i64 %x, i64 %y, i8* %p) {
%r = extractvalue { i8, i64 } %s, 1
ret i64 %r
}

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8)
declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8)

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Verify that instcombine is able to fold identity shuffles.

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

declare i32 @llvm.x86.tbm.bextri.u32(i32, i32) nounwind readnone
declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) nounwind readnone

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

;; MASKED LOADS

@ -325,4 +325,3 @@ declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>)
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>)

declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*)

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

;
; UNDEF Elts

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

; Verify that instcombine is able to fold identity shuffles.

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define float @test_rcp_ss_0(float %a) {

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define double @test_sqrt_sd_0(double %a) {

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define <2 x double> @test_round_sd(<2 x double> %a, <2 x double> %b) {

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

;
; EXTRQ

@ -1,4 +1,4 @@
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define i16 @test1(float %f) {

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

;

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Verify that instcombine is able to fold identity shuffles.

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
; RUN: opt < %s -instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

define <2 x double> @test_vfrcz_sd(<2 x double> %a) {
; CHECK-LABEL: @test_vfrcz_sd(