
[Vectorizers][TTI] remove option to bypass creation of vector reduction intrinsics

The vector reduction intrinsics started life as experimental ops, so backend support
was lacking. As part of promoting them to first-class intrinsics, however, codegen
support was added/improved:
D58015
D90247

So I think it is safe to now remove this complication from IR.
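For illustration, here is a minimal hand-written example (not taken from any file in this
patch) of the two forms for a <4 x i32> add reduction: the shuffle sequence the vectorizers
used to emit when the TTI hook returned false, and the intrinsic call they now always emit:

  ; shuffle ("log2 halving") form
  define i32 @sum_shuffle(<4 x i32> %v) {
    %rdx.shuf = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %bin.rdx = add <4 x i32> %v, %rdx.shuf
    %rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx2 = add <4 x i32> %bin.rdx, %rdx.shuf1
    %r = extractelement <4 x i32> %bin.rdx2, i32 0
    ret i32 %r
  }

  ; intrinsic form
  define i32 @sum_intrinsic(<4 x i32> %v) {
    %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
    ret i32 %r
  }
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

Note that the fadd/fmul variants of the intrinsic take an explicit start value; the
vectorizers pass the neutral element (-0.0 for fadd, 1.0 for fmul), which is why the
updated checks below show operands like 'float -0.000000e+00' and 'half 0xH8000'.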

Note that we still have an IR-level codegen expansion pass for these as discussed
in D95690. Removing that is another step in simplifying the logic. Also note that
x86 was already unconditionally forming reductions in IR, so there should be no
difference for x86.

I spot-checked a couple of the tests here by running them through opt+llc and did
not see any asm diffs.

If we do find functional differences for other targets, it should be possible
to (at least temporarily) restore the shuffle IR with the ExpandReductions IR
pass.
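
For the min/max reductions (which account for much of the test churn below), the old
expansion was a chain of shuffles plus icmp/fcmp and select, while the new form is a
single intrinsic call. A hand-written sketch for a signed-max reduction (illustrative
function names, not from this patch):

  define i32 @smax_shuffle(<4 x i32> %vec) {
    %rdx.shuf = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %cmp = icmp sgt <4 x i32> %vec, %rdx.shuf
    %sel = select <4 x i1> %cmp, <4 x i32> %vec, <4 x i32> %rdx.shuf
    %rdx.shuf1 = shufflevector <4 x i32> %sel, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %cmp1 = icmp sgt <4 x i32> %sel, %rdx.shuf1
    %sel1 = select <4 x i1> %cmp1, <4 x i32> %sel, <4 x i32> %rdx.shuf1
    %r = extractelement <4 x i32> %sel1, i32 0
    ret i32 %r
  }

  define i32 @smax_intrinsic(<4 x i32> %vec) {
    %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %vec)
    ret i32 %r
  }
  declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)

Returning true from TTI::shouldExpandReduction makes the existing ExpandReductions
codegen pass rewrite such calls back into the shuffle form before instruction selection,
so that escape hatch remains available.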

Differential Revision: https://reviews.llvm.org/D96552
Sanjay Patel 2021-02-12 08:07:29 -05:00
parent b4a7561ae7
commit d1a8bb697a
33 changed files with 120 additions and 391 deletions


@ -1326,11 +1326,6 @@ public:
bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present.
};
/// \returns True if the target wants to handle the given reduction idiom in
/// the intrinsics form instead of the shuffle form.
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const;
/// \returns True if the target prefers reductions in loop.
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const;
@ -1652,8 +1647,6 @@ public:
virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const = 0;
virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
@ -2183,10 +2176,6 @@ public:
VectorType *VecTy) const override {
return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const override {
return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
}
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const override {
return Impl.preferInLoopReduction(Opcode, Ty, Flags);


@ -700,11 +700,6 @@ public:
return VF;
}
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return false;
}
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return false;


@ -1050,11 +1050,6 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const {
return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
}
bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const {
return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags);


@ -1089,31 +1089,6 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
return Considerable;
}
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
auto *VTy = cast<VectorType>(Ty);
unsigned ScalarBits = Ty->getScalarSizeInBits();
switch (Opcode) {
case Instruction::FAdd:
case Instruction::FMul:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
case Instruction::Mul:
return false;
case Instruction::Add:
return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
case Instruction::ICmp:
return (ScalarBits < 64) &&
(ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128);
case Instruction::FCmp:
return Flags.NoNaN;
default:
llvm_unreachable("Unhandled reduction opcode");
}
return false;
}
int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsPairwise, bool IsUnsigned,
TTI::TargetCostKind CostKind) {


@ -266,9 +266,6 @@ public:
bool supportsScalableVectors() const { return ST->hasSVE(); }
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
bool IsPairwiseForm,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);


@ -2091,11 +2091,6 @@ void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getPeelingPreferences(L, SE, PP);
}
bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return ST->hasMVEIntegerOps();
}
bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
if (!ST->hasMVEIntegerOps())


@ -186,9 +186,6 @@ public:
int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
VectorType *SubTp);
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;


@ -231,14 +231,6 @@ public:
bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
/// Allow vectorizers to form reduction intrinsics in IR. The IR is expanded
/// into shuffles and vector math/logic by the backend
/// (see TTI::shouldExpandReduction)
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return true;
}
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
Align Alignment, unsigned AddressSpace);


@ -54,11 +54,6 @@
using namespace llvm;
using namespace llvm::PatternMatch;
static cl::opt<bool> ForceReductionIntrinsic(
"force-reduction-intrinsics", cl::Hidden,
cl::desc("Force creating reduction intrinsics for testing."),
cl::init(false));
#define DEBUG_TYPE "loop-utils"
static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
@ -1025,14 +1020,10 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
const TargetTransformInfo *TTI,
Value *Src, RecurKind RdxKind,
ArrayRef<Value *> RedOps) {
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
TargetTransformInfo::ReductionFlags RdxFlags;
RdxFlags.IsMaxOp = RdxKind == RecurKind::SMax || RdxKind == RecurKind::UMax ||
RdxKind == RecurKind::FMax;
RdxFlags.IsSigned = RdxKind == RecurKind::SMax || RdxKind == RecurKind::SMin;
if (!ForceReductionIntrinsic &&
!TTI->useReductionIntrinsic(Opcode, Src->getType(), RdxFlags))
return getShuffleReduction(Builder, Src, Opcode, RdxKind, RedOps);
auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType();
switch (RdxKind) {


@ -5,10 +5,9 @@ target triple = "aarch64--linux-gnu"
@b = common local_unnamed_addr global i32 0, align 4
@a = common local_unnamed_addr global i16* null, align 8
; Function Attrs: norecurse nounwind readonly
define i32 @fn1() local_unnamed_addr #0 {
; Ensure that we don't emit reduction intrinsics for unsupported short reductions.
; CHECK-NOT: @llvm.vector.reduce
; We expect the backend to expand all reductions.
; CHECK: @llvm.vector.reduce
entry:
%0 = load i32, i32* @b, align 4, !tbaa !1
%cmp40 = icmp sgt i32 %0, 0


@ -62,9 +62,7 @@ define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) {
; GFX9-NEXT: [[BIN_RDX18:%.*]] = fadd fast <2 x half> [[TMP21]], [[BIN_RDX17]]
; GFX9-NEXT: [[BIN_RDX19:%.*]] = fadd fast <2 x half> [[TMP22]], [[BIN_RDX18]]
; GFX9-NEXT: [[BIN_RDX20:%.*]] = fadd fast <2 x half> [[TMP23]], [[BIN_RDX19]]
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x half> [[BIN_RDX20]], <2 x half> poison, <2 x i32> <i32 1, i32 undef>
; GFX9-NEXT: [[BIN_RDX21:%.*]] = fadd fast <2 x half> [[BIN_RDX20]], [[RDX_SHUF]]
; GFX9-NEXT: [[TMP25:%.*]] = extractelement <2 x half> [[BIN_RDX21]], i32 0
; GFX9-NEXT: [[TMP25:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX20]])
; GFX9-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; GFX9: scalar.ph:
; GFX9-NEXT: br label [[FOR_BODY:%.*]]
@ -132,9 +130,7 @@ define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) {
; VI-NEXT: [[BIN_RDX18:%.*]] = fadd fast <2 x half> [[TMP21]], [[BIN_RDX17]]
; VI-NEXT: [[BIN_RDX19:%.*]] = fadd fast <2 x half> [[TMP22]], [[BIN_RDX18]]
; VI-NEXT: [[BIN_RDX20:%.*]] = fadd fast <2 x half> [[TMP23]], [[BIN_RDX19]]
; VI-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x half> [[BIN_RDX20]], <2 x half> poison, <2 x i32> <i32 1, i32 undef>
; VI-NEXT: [[BIN_RDX21:%.*]] = fadd fast <2 x half> [[BIN_RDX20]], [[RDX_SHUF]]
; VI-NEXT: [[TMP25:%.*]] = extractelement <2 x half> [[BIN_RDX21]], i32 0
; VI-NEXT: [[TMP25:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX20]])
; VI-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VI: scalar.ph:
; VI-NEXT: br label [[FOR_BODY:%.*]]


@ -67,9 +67,7 @@ define i32 @test(float* nocapture readonly %x) {
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP16]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP16]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]]
; CHECK: scalar.ph:


@ -22,9 +22,7 @@ define dso_local double @test(float* %Arr) {
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP5]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[DOTLCSSA]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[DOTLCSSA]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
; CHECK-NEXT: ret double [[TMP7]]
;
entry:


@ -20,9 +20,7 @@ define dso_local double @test(float* %Arr) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP5]])
; CHECK-NEXT: ret double [[TMP7]]
;
entry:


@ -14,8 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK: add i64 %index, 2, !dbg ![[LOC]]
; CHECK: icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]]
; CHECK: middle.block
; CHECK: add <2 x i32> %{{.*}}, %rdx.shuf, !dbg ![[BR_LOC:[0-9]+]]
; CHECK: extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[BR_LOC]]
; CHECK: call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %{{.*}}), !dbg ![[BR_LOC:[0-9]+]]
; CHECK: for.body
; CHECK: br i1{{.*}}, label %for.body,{{.*}}, !dbg ![[BR_LOC]],
; CHECK: ![[BR_LOC]] = !DILocation(line: 5,


@ -143,7 +143,7 @@ scalar.body:
; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; Check also that the casts were not moved needlessly.
; CHECK: sitofp <4 x i16> [[L1]] to <4 x double>
; CHECK: sitofp <4 x i16> [[SHUF]] to <4 x double>
; CHECK: sitofp <4 x i16> [[SHUF]] to <4 x double>
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
; CHECK: scalar.ph:
@ -357,8 +357,8 @@ for.end:
}
; We vectorize this first order recurrence, by generating two
; extracts for the phi `val.phi` - one at the last index and
; another at the second last index. We need these 2 extracts because
; extracts for the phi `val.phi` - one at the last index and
; another at the second last index. We need these 2 extracts because
; the first order recurrence phi is used outside the loop, so we require the phi
; itself and not its update (addx).
; UNROLL-NO-IC-LABEL: extract_second_last_iteration
@ -705,16 +705,12 @@ define i32 @sink_into_replication_region(i32 %y) {
; CHECK-NEXT: [[TMP21]] = phi <4 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP20]], [[PRED_UDIV_IF8]] ]
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP21]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI1]], [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]]
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP24]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF10]]
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[BIN_RDX11]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]]
; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[BB2:%.*]]
@ -834,17 +830,13 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
; CHECK-NEXT: store i32 [[TMP4]], i32* [[TMP38]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]]
; CHECK: pred.store.continue16:
; CHECK-NEXT: [[TMP39:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]]
; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP39]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF17:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX18:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF17]]
; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[BIN_RDX18]], i32 0
; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]]
; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]])
; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[BB2:%.*]]


@ -7,11 +7,7 @@
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL:[0-9]+]]
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= shufflevector <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= shufflevector <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= extractelement <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> {{.*}}), !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= icmp eq i64{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: br i1 %{{.*}}, !dbg ![[DL]]
; CHECK: ![[DL]] = !DILocation(line: 5,


@ -56,8 +56,7 @@ define i32 @flags2(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
; CHECK: load <4 x float>
; CHECK: fadd fast <4 x float>
; CHECK: br
; CHECK: fadd fast <4 x float>
; CHECK: fadd fast <4 x float>
; CHECK: call fast float @llvm.vector.reduce.fadd.v4f32
define float @fast_math(float* noalias %s) {
entry:
br label %for.body


@ -68,13 +68,7 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp olt <4 x float> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP5]], <4 x float> [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP5]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 65536, 65536
; CHECK-NEXT: br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:


@ -387,10 +387,10 @@ for.end: ; preds = %for.body, %entry
}
; Double pattern:
; Check that is not vectorized if fp-instruction has no fast-math property.
; Check that is not vectorized if fp-instruction has no fast-math property.
;
; double fcmp_0_fsub_select2_notvectorize(double * restrict x, const int N) {
; double sum = 0.
; double sum = 0.
; for (int i = 0; i < N; ++i)
; if (x[i] > 0.)
; sum -= x[i];
@ -468,7 +468,7 @@ for.end: ; preds = %for.body, %entry
}
; Float pattern:
; Check that is not vectorized if fp-instruction has no fast-math property.
; Check that is not vectorized if fp-instruction has no fast-math property.
;
; float fcmp_0_fmult_select1_notvectorize(float * restrict x, const int N) {
; float sum = 0.
@ -793,9 +793,10 @@ for.end: ; preds = %for.inc, %entry
; return sum;
; }
; CHECK-LABEL: @fcmp_store_back(
; CHECK-NOT: <4 x float>
define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind readonly {
; CHECK-LABEL: @fcmp_store_back(
; CHECK-NOT: <4 x float>
;
entry:
%cmp7 = icmp sgt i32 %LEN, 0
br i1 %cmp7, label %for.body.preheader, label %for.end
@ -819,3 +820,6 @@ for.end: ; preds = %for.body, %entry
%sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
ret float %sum.0.lcssa
}
; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare


@ -504,7 +504,7 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
; CHECK: br i1 true, label %scalar.ph, label %vector.ph
; CHECK: middle.block:
; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0
; CHECK: %[[v9:.+]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32>
; CHECK: scalar.ph:
; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ]
; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ %[[v9]], %middle.block ]


@ -214,11 +214,7 @@ define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF4]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@ -865,16 +861,8 @@ define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[TMP3]], [[RDX_SHUF5]]
; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <4 x i32> [[BIN_RDX6]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[BIN_RDX8]], i32 0
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP4]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@ -1061,11 +1049,7 @@ define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP17]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@ -1259,11 +1243,7 @@ define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP30:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP20]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP20]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]


@ -27,7 +27,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK-NEXT: br i1
; CHECK-LABEL: middle.block:
; CHECK: %rdx.shuf = shufflevector <4 x i32>
; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
entry:
%ntrunc = trunc i64 %n to i32
@ -364,7 +364,8 @@ for.end: ; preds = %for.body
; variant value stored to uniform address tests that the code gen extracts the
; last element from the variant vector and scalar stores it into the uniform
; address.
; CHECK-LABEL: variant_val_store_to_inv_address
define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
; CHECK-LABEL: @variant_val_store_to_inv_address(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
@ -389,20 +390,16 @@ for.end: ; preds = %for.body
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8, !alias.scope !36
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4
; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4, !alias.scope !39, !noalias !36
; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[DOTLCSSA]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[DOTLCSSA]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF5]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@ -418,11 +415,14 @@ for.end: ; preds = %for.body
; CHECK-NEXT: [[TMP3]] = add i32 [[TMP0]], [[TMP2]]
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], [[LOOP42:!llvm.loop !.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: [[TMP3_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[FOR_BODY]] ]
; CHECK-NEXT: br label [[FOR_END]]
define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
; CHECK: for.end:
; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP3_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT: ret i32 [[RDX_LCSSA]]
;
entry:
%ntrunc = trunc i64 %n to i32
%cmp = icmp eq i32 %ntrunc, %k
@ -591,3 +591,6 @@ bb7:
bb26:
ret void
}
; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare


@ -1092,9 +1092,7 @@ define i32 @me_reduction(i32* %addr) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[BIN_RDX]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP5]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 201, 200
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:


@ -16,8 +16,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smax.v2i32
define i32 @max_red(i32 %max) {
entry:
@ -45,8 +44,7 @@ for.end:
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smax.v2i32
define i32 @max_red_inverse_select(i32 %max) {
entry:
@ -73,8 +71,7 @@ for.end:
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smin.v2i32
define i32 @min_red(i32 %max) {
entry:
@ -102,8 +99,7 @@ for.end:
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smin.v2i32
define i32 @min_red_inverse_select(i32 %max) {
entry:
@ -132,8 +128,7 @@ for.end:
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umax.v2i32
define i32 @umax_red(i32 %max) {
entry:
@ -161,8 +156,7 @@ for.end:
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umax.v2i32
define i32 @umax_red_inverse_select(i32 %max) {
entry:
@ -189,8 +183,7 @@ for.end:
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umin.v2i32
define i32 @umin_red(i32 %max) {
entry:
@ -218,8 +211,7 @@ for.end:
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umin.v2i32
define i32 @umin_red_inverse_select(i32 %max) {
entry:
@ -247,8 +239,7 @@ for.end:
; CHECK: icmp sge <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smin.v2i32
define i32 @sge_min_red(i32 %max) {
entry:
@ -276,8 +267,7 @@ for.end:
; CHECK: icmp sle <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smax.v2i32
define i32 @sle_min_red(i32 %max) {
entry:
@ -305,8 +295,7 @@ for.end:
; CHECK: icmp uge <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umin.v2i32
define i32 @uge_min_red(i32 %max) {
entry:
@ -334,8 +323,7 @@ for.end:
; CHECK: icmp ule <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umax.v2i32
define i32 @ule_min_red(i32 %max) {
entry:
@ -415,8 +403,7 @@ for.end:
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @max_red_float(float %max) #0 {
entry:
@ -441,8 +428,7 @@ for.end:
; CHECK: fcmp fast oge <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @max_red_float_ge(float %max) #0 {
entry:
@ -467,8 +453,7 @@ for.end:
; CHECK: fcmp fast olt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @inverted_max_red_float(float %max) #0 {
entry:
@ -493,8 +478,7 @@ for.end:
; CHECK: fcmp fast ole <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @inverted_max_red_float_le(float %max) #0 {
entry:
@ -519,8 +503,7 @@ for.end:
; CHECK: fcmp fast ugt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @unordered_max_red_float(float %max) #0 {
entry:
@ -545,8 +528,7 @@ for.end:
; CHECK: fcmp fast uge <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @unordered_max_red_float_ge(float %max) #0 {
entry:
@ -571,8 +553,7 @@ for.end:
; CHECK: fcmp fast ult <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @inverted_unordered_max_red_float(float %max) #0 {
entry:
@ -597,8 +578,7 @@ for.end:
; CHECK: fcmp fast ule <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @inverted_unordered_max_red_float_le(float %max) #0 {
entry:
@ -626,8 +606,7 @@ for.end:
; CHECK: fcmp fast olt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @min_red_float(float %min) #0 {
entry:
@ -652,8 +631,7 @@ for.end:
; CHECK: fcmp fast ole <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @min_red_float_le(float %min) #0 {
entry:
@ -678,8 +656,7 @@ for.end:
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @inverted_min_red_float(float %min) #0 {
entry:
@ -704,8 +681,7 @@ for.end:
; CHECK: fcmp fast oge <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @inverted_min_red_float_ge(float %min) #0 {
entry:
@ -730,8 +706,7 @@ for.end:
; CHECK: fcmp fast ult <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @unordered_min_red_float(float %min) #0 {
entry:
@ -756,8 +731,7 @@ for.end:
; CHECK: fcmp fast ule <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @unordered_min_red_float_le(float %min) #0 {
entry:
@ -782,8 +756,7 @@ for.end:
; CHECK: fcmp fast ugt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @inverted_unordered_min_red_float(float %min) #0 {
entry:
@ -808,8 +781,7 @@ for.end:
; CHECK: fcmp fast uge <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @inverted_unordered_min_red_float_ge(float %min) #0 {
entry:
@ -835,8 +807,7 @@ for.end:
; CHECK: fcmp fast olt <2 x double>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x double>
; CHECK: select fast <2 x i1>
; CHECK: call fast double @llvm.vector.reduce.fmin.v2f64
define double @min_red_double(double %min) #0 {
entry:
@ -881,5 +852,7 @@ for.end:
ret float %max.red.0
}
; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare
attributes #0 = { "no-nans-fp-math"="true" }


@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"


@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"


@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"


@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-predicated-reduction-select -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-predicated-reduction-select -dce -instcombine -S | FileCheck %s
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"


@ -6,11 +6,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
%1 = icmp sgt i32 %n, 0
@ -41,11 +37,7 @@ define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: mul <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: mul <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: mul <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
%1 = icmp sgt i32 %n, 0
@ -76,11 +68,7 @@ define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocap
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: mul nsw <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
%1 = icmp sgt i32 %n, 0
@ -109,11 +97,7 @@ define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
;CHECK-LABEL: @reduction_mul(
;CHECK: mul <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: mul <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: mul <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
%1 = icmp sgt i32 %n, 0
@ -143,11 +127,7 @@ define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
;CHECK-LABEL: @start_at_non_zero(
;CHECK: phi <4 x i32>
;CHECK: <i32 120, i32 0, i32 0, i32 0>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
entry:
@ -176,11 +156,7 @@ for.end: ; preds = %for.body, %entry
;CHECK-LABEL: @reduction_and(
;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
;CHECK: and <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: and <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: and <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
@ -208,11 +184,7 @@ for.end: ; preds = %for.body, %entry
;CHECK-LABEL: @reduction_or(
;CHECK: or <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: or <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: or <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.or.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
@ -240,11 +212,7 @@ for.end: ; preds = %for.body, %entry
;CHECK-LABEL: @reduction_xor(
;CHECK: xor <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: xor <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: xor <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
@ -498,11 +466,7 @@ exit:
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: %sum.copy = phi i32 [ %[[SCALAR:.*]], %.lr.ph ], [ %[[VECTOR:.*]], %middle.block ]
;CHECK: ret i32
define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) {
@ -577,3 +541,6 @@ entry:
store i32 %.0.lcssa, i32* %c10, align 4
ret void
}
; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare


@ -41,13 +41,7 @@ define i32 @test(i64 %N, i32 %x) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP6:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF5]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT7:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP6]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF5]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT7]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]


@ -21,18 +21,12 @@ define i32 @smaxv6() {
; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8
; GFX9-NEXT: ret i32 [[OP_EXTRA4]]
; GFX9-NEXT: ret i32 [[OP_EXTRA1]]
;
%load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@ -68,18 +62,12 @@ define i64 @sminv6() {
; GFX9-NEXT: [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]])
; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8
; GFX9-NEXT: ret i64 [[OP_EXTRA4]]
; GFX9-NEXT: ret i64 [[OP_EXTRA1]]
;
%load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
%load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
@ -217,18 +205,12 @@ define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
; GFX9-NEXT: [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]]
; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8
; GFX9-NEXT: ret i32 [[OP_EXTRA4]]
; GFX9-NEXT: ret i32 [[OP_EXTRA1]]
;
%vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
%elt1 = extractelement <2 x i32> %vload, i32 0


@ -5,11 +5,7 @@
define half @reduction_half4(<4 x half> %a) {
; GFX9-LABEL: @reduction_half4(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x half> [[A]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[BIN_RDX]], <4 x half> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x half> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[BIN_RDX2]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]])
; GFX9-NEXT: ret half [[TMP0]]
;
; VI-LABEL: @reduction_half4(
@ -39,13 +35,7 @@ entry:
define half @reduction_half8(<8 x half> %vec8) {
; GFX9-LABEL: @reduction_half8(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x half> [[VEC8:%.*]], <8 x half> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x half> [[VEC8]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x half> [[BIN_RDX]], <8 x half> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x half> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x half> [[BIN_RDX2]], <8 x half> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[BIN_RDX4]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]])
; GFX9-NEXT: ret half [[TMP0]]
;
; VI-LABEL: @reduction_half8(
@ -91,15 +81,7 @@ entry:
define half @reduction_half16(<16 x half> %vec16) {
; GFX9-LABEL: @reduction_half16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x half> [[VEC16]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x half> [[BIN_RDX]], <16 x half> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x half> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x half> [[BIN_RDX2]], <16 x half> poison, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
; GFX9-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x half> [[BIN_RDX4]], <16 x half> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x half> [[BIN_RDX4]], [[RDX_SHUF5]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x half> [[BIN_RDX6]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> [[VEC16:%.*]])
; GFX9-NEXT: ret half [[TMP0]]
;
; VI-LABEL: @reduction_half16(
@@ -203,11 +185,7 @@ entry:
define i16 @reduction_v4i16(<4 x i16> %a) {
; GFX9-LABEL: @reduction_v4i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[A:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = add <4 x i16> [[A]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[BIN_RDX]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <4 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[BIN_RDX2]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_v4i16(
@@ -237,13 +215,7 @@ entry:
define i16 @reduction_v8i16(<8 x i16> %vec8) {
; GFX9-LABEL: @reduction_v8i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[VEC8]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <8 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX2]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX4:%.*]] = add <8 x i16> [[BIN_RDX2]], [[RDX_SHUF3]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[BIN_RDX4]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VEC8:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_v8i16(
@@ -289,13 +261,7 @@ entry:
define i16 @reduction_umin_v4i16(<4 x i16> %vec4) {
; GFX9-LABEL: @reduction_umin_v4i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <4 x i16> [[VEC4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[VEC4:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_umin_v4i16(
@@ -331,16 +297,7 @@ entry:
define i16 @reduction_icmp_v8i16(<8 x i16> %vec8) {
; GFX9-LABEL: @reduction_icmp_v8i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i16> [[VEC8]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i16> [[VEC8]], <8 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> [[RDX_SHUF4]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[RDX_MINMAX_SELECT6]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[VEC8:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_icmp_v8i16(
@@ -402,19 +359,7 @@ entry:
define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
; GFX9-LABEL: @reduction_smin_v16i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i16> [[VEC16:%.*]], <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <16 x i16> [[VEC16]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i16> [[VEC16]], <16 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> poison, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> [[RDX_SHUF4]]
; GFX9-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> [[RDX_SHUF7]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x i16> [[RDX_MINMAX_SELECT9]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> [[VEC16:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_smin_v16i16(
@@ -530,13 +475,7 @@ entry:
define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
; GFX9-LABEL: @reduction_umax_v4i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i16> [[VEC4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ugt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[VEC4:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_umax_v4i16(
@@ -572,13 +511,7 @@ entry:
define i16 @reduction_smax_v4i16(<4 x i16> %vec4) {
; GFX9-LABEL: @reduction_smax_v4i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i16> [[VEC4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[VEC4:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_smax_v4i16(