
[Vectorizers][TTI] remove option to bypass creation of vector reduction intrinsics

The vector reduction intrinsics started life as experimental ops, so backend support
was lacking. As part of promoting them to first-class intrinsics, however, codegen
support was added/improved:
D58015
D90247

So I think it is safe to now remove this complication from IR.
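For illustration, here is a minimal hand-written example (not taken from any file in this
patch) of the two forms for a <4 x i32> add reduction: the shuffle sequence the vectorizers
used to emit when the TTI hook returned false, and the intrinsic call they now always emit:

  ; shuffle ("log2 halving") form
  define i32 @sum_shuffle(<4 x i32> %v) {
    %rdx.shuf = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %bin.rdx = add <4 x i32> %v, %rdx.shuf
    %rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %bin.rdx2 = add <4 x i32> %bin.rdx, %rdx.shuf1
    %r = extractelement <4 x i32> %bin.rdx2, i32 0
    ret i32 %r
  }

  ; intrinsic form
  define i32 @sum_intrinsic(<4 x i32> %v) {
    %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
    ret i32 %r
  }
  declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

Note that the fadd/fmul variants of the intrinsic take an explicit start value; the
vectorizers pass the neutral element (-0.0 for fadd, 1.0 for fmul), which is why the
updated checks below show operands like 'float -0.000000e+00' and 'half 0xH8000'.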

Note that we still have an IR-level codegen expansion pass for these as discussed
in D95690. Removing that is another step in simplifying the logic. Also note that
x86 was already unconditionally forming reductions in IR, so there should be no
difference for x86.

I spot-checked a couple of the tests here by running them through opt+llc and did
not see any asm diffs.

If we do find functional differences for other targets, it should be possible
to (at least temporarily) restore the shuffle IR with the ExpandReductions IR
pass.
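
For the min/max reductions (which account for much of the test churn below), the old
expansion was a chain of shuffles plus icmp/fcmp and select, while the new form is a
single intrinsic call. A hand-written sketch for a signed-max reduction (illustrative
function names, not from this patch):

  define i32 @smax_shuffle(<4 x i32> %vec) {
    %rdx.shuf = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
    %cmp = icmp sgt <4 x i32> %vec, %rdx.shuf
    %sel = select <4 x i1> %cmp, <4 x i32> %vec, <4 x i32> %rdx.shuf
    %rdx.shuf1 = shufflevector <4 x i32> %sel, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
    %cmp1 = icmp sgt <4 x i32> %sel, %rdx.shuf1
    %sel1 = select <4 x i1> %cmp1, <4 x i32> %sel, <4 x i32> %rdx.shuf1
    %r = extractelement <4 x i32> %sel1, i32 0
    ret i32 %r
  }

  define i32 @smax_intrinsic(<4 x i32> %vec) {
    %r = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %vec)
    ret i32 %r
  }
  declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)

Returning true from TTI::shouldExpandReduction makes the existing ExpandReductions
codegen pass rewrite such calls back into the shuffle form before instruction selection,
so that escape hatch remains available.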

Differential Revision: https://reviews.llvm.org/D96552
Sanjay Patel 2021-02-12 08:07:29 -05:00
parent b4a7561ae7
commit d1a8bb697a
33 changed files with 120 additions and 391 deletions


@ -1326,11 +1326,6 @@ public:
bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present.
};
/// \returns True if the target wants to handle the given reduction idiom in
/// the intrinsics form instead of the shuffle form.
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const;
/// \returns True if the target prefers reductions in loop.
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const;
@ -1652,8 +1647,6 @@ public:
virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const = 0;
virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
@ -2183,10 +2176,6 @@ public:
VectorType *VecTy) const override {
return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const override {
return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
}
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const override {
return Impl.preferInLoopReduction(Opcode, Ty, Flags);


@ -700,11 +700,6 @@ public:
return VF;
}
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return false;
}
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return false;


@ -1050,11 +1050,6 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const {
return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
}
bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const {
return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags);


@ -1089,31 +1089,6 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
return Considerable;
}
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
auto *VTy = cast<VectorType>(Ty);
unsigned ScalarBits = Ty->getScalarSizeInBits();
switch (Opcode) {
case Instruction::FAdd:
case Instruction::FMul:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
case Instruction::Mul:
return false;
case Instruction::Add:
return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
case Instruction::ICmp:
return (ScalarBits < 64) &&
(ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128);
case Instruction::FCmp:
return Flags.NoNaN;
default:
llvm_unreachable("Unhandled reduction opcode");
}
return false;
}
int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
bool IsPairwise, bool IsUnsigned,
TTI::TargetCostKind CostKind) {


@ -266,9 +266,6 @@ public:
bool supportsScalableVectors() const { return ST->hasSVE(); }
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
bool IsPairwiseForm,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);


@ -2091,11 +2091,6 @@ void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getPeelingPreferences(L, SE, PP);
}
bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return ST->hasMVEIntegerOps();
}
bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
if (!ST->hasMVEIntegerOps())


@ -186,9 +186,6 @@ public:
int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
VectorType *SubTp);
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;


@ -231,14 +231,6 @@ public:
bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
/// Allow vectorizers to form reduction intrinsics in IR. The IR is expanded
/// into shuffles and vector math/logic by the backend
/// (see TTI::shouldExpandReduction)
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return true;
}
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
Align Alignment, unsigned AddressSpace);


@ -54,11 +54,6 @@
using namespace llvm;
using namespace llvm::PatternMatch;
static cl::opt<bool> ForceReductionIntrinsic(
"force-reduction-intrinsics", cl::Hidden,
cl::desc("Force creating reduction intrinsics for testing."),
cl::init(false));
#define DEBUG_TYPE "loop-utils"
static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
@ -1025,14 +1020,10 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
const TargetTransformInfo *TTI,
Value *Src, RecurKind RdxKind,
ArrayRef<Value *> RedOps) {
unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
TargetTransformInfo::ReductionFlags RdxFlags;
RdxFlags.IsMaxOp = RdxKind == RecurKind::SMax || RdxKind == RecurKind::UMax ||
RdxKind == RecurKind::FMax;
RdxFlags.IsSigned = RdxKind == RecurKind::SMax || RdxKind == RecurKind::SMin;
if (!ForceReductionIntrinsic &&
!TTI->useReductionIntrinsic(Opcode, Src->getType(), RdxFlags))
return getShuffleReduction(Builder, Src, Opcode, RdxKind, RedOps);
auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType();
switch (RdxKind) {


@ -5,10 +5,9 @@ target triple = "aarch64--linux-gnu"
@b = common local_unnamed_addr global i32 0, align 4
@a = common local_unnamed_addr global i16* null, align 8
; Function Attrs: norecurse nounwind readonly
define i32 @fn1() local_unnamed_addr #0 {
; Ensure that we don't emit reduction intrinsics for unsupported short reductions.
; CHECK-NOT: @llvm.vector.reduce
; We expect the backend to expand all reductions.
; CHECK: @llvm.vector.reduce
entry:
%0 = load i32, i32* @b, align 4, !tbaa !1
%cmp40 = icmp sgt i32 %0, 0


@ -62,9 +62,7 @@ define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) {
; GFX9-NEXT: [[BIN_RDX18:%.*]] = fadd fast <2 x half> [[TMP21]], [[BIN_RDX17]]
; GFX9-NEXT: [[BIN_RDX19:%.*]] = fadd fast <2 x half> [[TMP22]], [[BIN_RDX18]]
; GFX9-NEXT: [[BIN_RDX20:%.*]] = fadd fast <2 x half> [[TMP23]], [[BIN_RDX19]]
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x half> [[BIN_RDX20]], <2 x half> poison, <2 x i32> <i32 1, i32 undef>
; GFX9-NEXT: [[BIN_RDX21:%.*]] = fadd fast <2 x half> [[BIN_RDX20]], [[RDX_SHUF]]
; GFX9-NEXT: [[TMP25:%.*]] = extractelement <2 x half> [[BIN_RDX21]], i32 0
; GFX9-NEXT: [[TMP25:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX20]])
; GFX9-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; GFX9: scalar.ph:
; GFX9-NEXT: br label [[FOR_BODY:%.*]]
@ -132,9 +130,7 @@ define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) {
; VI-NEXT: [[BIN_RDX18:%.*]] = fadd fast <2 x half> [[TMP21]], [[BIN_RDX17]]
; VI-NEXT: [[BIN_RDX19:%.*]] = fadd fast <2 x half> [[TMP22]], [[BIN_RDX18]]
; VI-NEXT: [[BIN_RDX20:%.*]] = fadd fast <2 x half> [[TMP23]], [[BIN_RDX19]]
; VI-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x half> [[BIN_RDX20]], <2 x half> poison, <2 x i32> <i32 1, i32 undef>
; VI-NEXT: [[BIN_RDX21:%.*]] = fadd fast <2 x half> [[BIN_RDX20]], [[RDX_SHUF]]
; VI-NEXT: [[TMP25:%.*]] = extractelement <2 x half> [[BIN_RDX21]], i32 0
; VI-NEXT: [[TMP25:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX20]])
; VI-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VI: scalar.ph:
; VI-NEXT: br label [[FOR_BODY:%.*]]


@ -67,9 +67,7 @@ define i32 @test(float* nocapture readonly %x) {
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP16]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP16]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]]
; CHECK: scalar.ph:


@ -22,9 +22,7 @@ define dso_local double @test(float* %Arr) {
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP5]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[DOTLCSSA]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[DOTLCSSA]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
; CHECK-NEXT: ret double [[TMP7]]
;
entry:


@ -20,9 +20,7 @@ define dso_local double @test(float* %Arr) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP5]])
; CHECK-NEXT: ret double [[TMP7]]
;
entry:


@ -14,8 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK: add i64 %index, 2, !dbg ![[LOC]]
; CHECK: icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]]
; CHECK: middle.block
; CHECK: add <2 x i32> %{{.*}}, %rdx.shuf, !dbg ![[BR_LOC:[0-9]+]]
; CHECK: extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[BR_LOC]]
; CHECK: call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %{{.*}}), !dbg ![[BR_LOC:[0-9]+]]
; CHECK: for.body
; CHECK: br i1{{.*}}, label %for.body,{{.*}}, !dbg ![[BR_LOC]],
; CHECK: ![[BR_LOC]] = !DILocation(line: 5,


@ -143,7 +143,7 @@ scalar.body:
; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; Check also that the casts were not moved needlessly.
; CHECK: sitofp <4 x i16> [[L1]] to <4 x double>
; CHECK: sitofp <4 x i16> [[SHUF]] to <4 x double>
; CHECK: sitofp <4 x i16> [[SHUF]] to <4 x double>
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
; CHECK: scalar.ph:
@ -357,8 +357,8 @@ for.end:
}
; We vectorize this first order recurrence, by generating two
; extracts for the phi `val.phi` - one at the last index and
; another at the second last index. We need these 2 extracts because
; extracts for the phi `val.phi` - one at the last index and
; another at the second last index. We need these 2 extracts because
; the first order recurrence phi is used outside the loop, so we require the phi
; itself and not its update (addx).
; UNROLL-NO-IC-LABEL: extract_second_last_iteration
@ -705,16 +705,12 @@ define i32 @sink_into_replication_region(i32 %y) {
; CHECK-NEXT: [[TMP21]] = phi <4 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP20]], [[PRED_UDIV_IF8]] ]
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP21]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI1]], [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]]
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP24]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF10]]
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[BIN_RDX11]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]]
; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[BB2:%.*]]
@ -834,17 +830,13 @@ define i32 @sink_into_replication_region_multiple(i32 *%x, i32 %y) {
; CHECK-NEXT: store i32 [[TMP4]], i32* [[TMP38]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]]
; CHECK: pred.store.continue16:
; CHECK-NEXT: [[TMP39:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]]
; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP39]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF17:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX18:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF17]]
; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[BIN_RDX18]], i32 0
; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]]
; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]])
; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[BB2:%.*]]


@ -7,11 +7,7 @@
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL:[0-9]+]]
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= shufflevector <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= shufflevector <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= extractelement <4 x i32>{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> {{.*}}), !dbg ![[DL]]
; CHECK-NEXT: %{{.*}}= icmp eq i64{{.*}}, !dbg ![[DL]]
; CHECK-NEXT: br i1 %{{.*}}, !dbg ![[DL]]
; CHECK: ![[DL]] = !DILocation(line: 5,


@ -56,8 +56,7 @@ define i32 @flags2(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
; CHECK: load <4 x float>
; CHECK: fadd fast <4 x float>
; CHECK: br
; CHECK: fadd fast <4 x float>
; CHECK: fadd fast <4 x float>
; CHECK: call fast float @llvm.vector.reduce.fadd.v4f32
define float @fast_math(float* noalias %s) {
entry:
br label %for.body


@ -68,13 +68,7 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp olt <4 x float> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP5]], <4 x float> [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP5]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 65536, 65536
; CHECK-NEXT: br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:


@ -387,10 +387,10 @@ for.end: ; preds = %for.body, %entry
}
; Double pattern:
; Check that is not vectorized if fp-instruction has no fast-math property.
; Check that is not vectorized if fp-instruction has no fast-math property.
;
; double fcmp_0_fsub_select2_notvectorize(double * restrict x, const int N) {
; double sum = 0.
; double sum = 0.
; for (int i = 0; i < N; ++i)
; if (x[i] > 0.)
; sum -= x[i];
@ -468,7 +468,7 @@ for.end: ; preds = %for.body, %entry
}
; Float pattern:
; Check that is not vectorized if fp-instruction has no fast-math property.
; Check that is not vectorized if fp-instruction has no fast-math property.
;
; float fcmp_0_fmult_select1_notvectorize(float * restrict x, const int N) {
; float sum = 0.
@ -793,9 +793,10 @@ for.end: ; preds = %for.inc, %entry
; return sum;
; }
; CHECK-LABEL: @fcmp_store_back(
; CHECK-NOT: <4 x float>
define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind readonly {
; CHECK-LABEL: @fcmp_store_back(
; CHECK-NOT: <4 x float>
;
entry:
%cmp7 = icmp sgt i32 %LEN, 0
br i1 %cmp7, label %for.body.preheader, label %for.end
@ -819,3 +820,6 @@ for.end: ; preds = %for.body, %entry
%sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
ret float %sum.0.lcssa
}
; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare


@ -504,7 +504,7 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
; CHECK: br i1 true, label %scalar.ph, label %vector.ph
; CHECK: middle.block:
; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0
; CHECK: %[[v9:.+]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32>
; CHECK: scalar.ph:
; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ]
; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ %[[v9]], %middle.block ]


@ -214,11 +214,7 @@ define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF4]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@ -865,16 +861,8 @@ define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[TMP3]], [[RDX_SHUF5]]
; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <4 x i32> [[BIN_RDX6]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[BIN_RDX8]], i32 0
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP4]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
@ -1061,11 +1049,7 @@ define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP17]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@ -1259,11 +1243,7 @@ define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP30:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP20]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP20]])
; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]


@ -27,7 +27,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK-NEXT: br i1
; CHECK-LABEL: middle.block:
; CHECK: %rdx.shuf = shufflevector <4 x i32>
; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
entry:
%ntrunc = trunc i64 %n to i32
@ -364,7 +364,8 @@ for.end: ; preds = %for.body
; variant value stored to uniform address tests that the code gen extracts the
; last element from the variant vector and scalar stores it into the uniform
; address.
; CHECK-LABEL: variant_val_store_to_inv_address
define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
; CHECK-LABEL: @variant_val_store_to_inv_address(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
@ -389,20 +390,16 @@ for.end: ; preds = %for.body
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8, !alias.scope !36
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4
; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4, !alias.scope !39, !noalias !36
; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[DOTLCSSA]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[DOTLCSSA]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF5]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@ -418,11 +415,14 @@ for.end: ; preds = %for.body
; CHECK-NEXT: [[TMP3]] = add i32 [[TMP0]], [[TMP2]]
; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], [[LOOP42:!llvm.loop !.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: [[TMP3_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[FOR_BODY]] ]
; CHECK-NEXT: br label [[FOR_END]]
define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
; CHECK: for.end:
; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP3_LCSSA]], [[FOR_END_LOOPEXIT]] ]
; CHECK-NEXT: ret i32 [[RDX_LCSSA]]
;
entry:
%ntrunc = trunc i64 %n to i32
%cmp = icmp eq i32 %ntrunc, %k
@ -591,3 +591,6 @@ bb7:
bb26:
ret void
}
; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare


@ -1092,9 +1092,7 @@ define i32 @me_reduction(i32* %addr) {
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP5]], [[RDX_SHUF]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[BIN_RDX]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP5]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 201, 200
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:


@ -16,8 +16,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smax.v2i32
define i32 @max_red(i32 %max) {
entry:
@ -45,8 +44,7 @@ for.end:
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smax.v2i32
define i32 @max_red_inverse_select(i32 %max) {
entry:
@ -73,8 +71,7 @@ for.end:
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smin.v2i32
define i32 @min_red(i32 %max) {
entry:
@ -102,8 +99,7 @@ for.end:
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smin.v2i32
define i32 @min_red_inverse_select(i32 %max) {
entry:
@ -132,8 +128,7 @@ for.end:
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umax.v2i32
define i32 @umax_red(i32 %max) {
entry:
@ -161,8 +156,7 @@ for.end:
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umax.v2i32
define i32 @umax_red_inverse_select(i32 %max) {
entry:
@ -189,8 +183,7 @@ for.end:
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umin.v2i32
define i32 @umin_red(i32 %max) {
entry:
@ -218,8 +211,7 @@ for.end:
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umin.v2i32
define i32 @umin_red_inverse_select(i32 %max) {
entry:
@ -247,8 +239,7 @@ for.end:
; CHECK: icmp sge <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp slt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smin.v2i32
define i32 @sge_min_red(i32 %max) {
entry:
@ -276,8 +267,7 @@ for.end:
; CHECK: icmp sle <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp sgt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.smax.v2i32
define i32 @sle_min_red(i32 %max) {
entry:
@ -305,8 +295,7 @@ for.end:
; CHECK: icmp uge <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ult <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umin.v2i32
define i32 @uge_min_red(i32 %max) {
entry:
@ -334,8 +323,7 @@ for.end:
; CHECK: icmp ule <2 x i32>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: icmp ugt <2 x i32>
; CHECK: select <2 x i1>
; CHECK: call i32 @llvm.vector.reduce.umax.v2i32
define i32 @ule_min_red(i32 %max) {
entry:
@ -415,8 +403,7 @@ for.end:
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @max_red_float(float %max) #0 {
entry:
@ -441,8 +428,7 @@ for.end:
; CHECK: fcmp fast oge <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @max_red_float_ge(float %max) #0 {
entry:
@ -467,8 +453,7 @@ for.end:
; CHECK: fcmp fast olt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @inverted_max_red_float(float %max) #0 {
entry:
@ -493,8 +478,7 @@ for.end:
; CHECK: fcmp fast ole <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @inverted_max_red_float_le(float %max) #0 {
entry:
@ -519,8 +503,7 @@ for.end:
; CHECK: fcmp fast ugt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @unordered_max_red_float(float %max) #0 {
entry:
@ -545,8 +528,7 @@ for.end:
; CHECK: fcmp fast uge <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @unordered_max_red_float_ge(float %max) #0 {
entry:
@ -571,8 +553,7 @@ for.end:
; CHECK: fcmp fast ult <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @inverted_unordered_max_red_float(float %max) #0 {
entry:
@ -597,8 +578,7 @@ for.end:
; CHECK: fcmp fast ule <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32
define float @inverted_unordered_max_red_float_le(float %max) #0 {
entry:
@ -626,8 +606,7 @@ for.end:
; CHECK: fcmp fast olt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @min_red_float(float %min) #0 {
entry:
@ -652,8 +631,7 @@ for.end:
; CHECK: fcmp fast ole <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @min_red_float_le(float %min) #0 {
entry:
@ -678,8 +656,7 @@ for.end:
; CHECK: fcmp fast ogt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @inverted_min_red_float(float %min) #0 {
entry:
@ -704,8 +681,7 @@ for.end:
; CHECK: fcmp fast oge <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @inverted_min_red_float_ge(float %min) #0 {
entry:
@ -730,8 +706,7 @@ for.end:
; CHECK: fcmp fast ult <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @unordered_min_red_float(float %min) #0 {
entry:
@ -756,8 +731,7 @@ for.end:
; CHECK: fcmp fast ule <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @unordered_min_red_float_le(float %min) #0 {
entry:
@ -782,8 +756,7 @@ for.end:
; CHECK: fcmp fast ugt <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @inverted_unordered_min_red_float(float %min) #0 {
entry:
@ -808,8 +781,7 @@ for.end:
; CHECK: fcmp fast uge <2 x float>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x float>
; CHECK: select fast <2 x i1>
; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32
define float @inverted_unordered_min_red_float_ge(float %min) #0 {
entry:
@ -835,8 +807,7 @@ for.end:
; CHECK: fcmp fast olt <2 x double>
; CHECK: select <2 x i1>
; CHECK: middle.block
; CHECK: fcmp fast olt <2 x double>
; CHECK: select fast <2 x i1>
; CHECK: call fast double @llvm.vector.reduce.fmin.v2f64
define double @min_red_double(double %min) #0 {
entry:
@ -881,5 +852,7 @@ for.end:
ret float %max.red.0
}
; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare
attributes #0 = { "no-nans-fp-math"="true" }


@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"


@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"


@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"


@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-predicated-reduction-select -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-predicated-reduction-select -dce -instcombine -S | FileCheck %s
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"


@ -6,11 +6,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
%1 = icmp sgt i32 %n, 0
@ -41,11 +37,7 @@ define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: mul <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: mul <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: mul <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
%1 = icmp sgt i32 %n, 0
@ -76,11 +68,7 @@ define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocap
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: mul nsw <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
%1 = icmp sgt i32 %n, 0
@ -109,11 +97,7 @@ define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
;CHECK-LABEL: @reduction_mul(
;CHECK: mul <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: mul <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: mul <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
%1 = icmp sgt i32 %n, 0
@ -143,11 +127,7 @@ define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapt
;CHECK-LABEL: @start_at_non_zero(
;CHECK: phi <4 x i32>
;CHECK: <i32 120, i32 0, i32 0, i32 0>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
entry:
@ -176,11 +156,7 @@ for.end: ; preds = %for.body, %entry
;CHECK-LABEL: @reduction_and(
;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
;CHECK: and <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: and <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: and <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
@ -208,11 +184,7 @@ for.end: ; preds = %for.body, %entry
;CHECK-LABEL: @reduction_or(
;CHECK: or <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: or <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: or <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.or.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
@ -240,11 +212,7 @@ for.end: ; preds = %for.body, %entry
;CHECK-LABEL: @reduction_xor(
;CHECK: xor <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: xor <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: xor <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
@ -498,11 +466,7 @@ exit:
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;CHECK: add <4 x i32>
;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: %sum.copy = phi i32 [ %[[SCALAR:.*]], %.lr.ph ], [ %[[VECTOR:.*]], %middle.block ]
;CHECK: ret i32
define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) {
@ -577,3 +541,6 @@ entry:
store i32 %.0.lcssa, i32* %c10, align 4
ret void
}
; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare


@ -41,13 +41,7 @@ define i32 @test(i64 %N, i32 %x) {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[RDX_MINMAX_CMP6:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF5]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT7:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP6]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF5]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT7]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]


@ -21,18 +21,12 @@ define i32 @smaxv6() {
; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8
; GFX9-NEXT: ret i32 [[OP_EXTRA4]]
; GFX9-NEXT: ret i32 [[OP_EXTRA1]]
;
%load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@ -68,18 +62,12 @@ define i64 @sminv6() {
; GFX9-NEXT: [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]])
; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8
; GFX9-NEXT: ret i64 [[OP_EXTRA4]]
; GFX9-NEXT: ret i64 [[OP_EXTRA1]]
;
%load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
%load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
@ -217,18 +205,12 @@ define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
; GFX9-NEXT: [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]]
; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8
; GFX9-NEXT: ret i32 [[OP_EXTRA4]]
; GFX9-NEXT: ret i32 [[OP_EXTRA1]]
;
%vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
%elt1 = extractelement <2 x i32> %vload, i32 0


@ -5,11 +5,7 @@
define half @reduction_half4(<4 x half> %a) {
; GFX9-LABEL: @reduction_half4(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x half> [[A]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[BIN_RDX]], <4 x half> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x half> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[BIN_RDX2]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]])
; GFX9-NEXT: ret half [[TMP0]]
;
; VI-LABEL: @reduction_half4(
@ -39,13 +35,7 @@ entry:
define half @reduction_half8(<8 x half> %vec8) {
; GFX9-LABEL: @reduction_half8(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x half> [[VEC8:%.*]], <8 x half> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x half> [[VEC8]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x half> [[BIN_RDX]], <8 x half> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x half> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x half> [[BIN_RDX2]], <8 x half> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[BIN_RDX4]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]])
; GFX9-NEXT: ret half [[TMP0]]
;
; VI-LABEL: @reduction_half8(
@ -91,15 +81,7 @@ entry:
define half @reduction_half16(<16 x half> %vec16) {
; GFX9-LABEL: @reduction_half16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x half> [[VEC16]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x half> [[BIN_RDX]], <16 x half> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x half> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x half> [[BIN_RDX2]], <16 x half> poison, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
; GFX9-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x half> [[BIN_RDX4]], <16 x half> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x half> [[BIN_RDX4]], [[RDX_SHUF5]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x half> [[BIN_RDX6]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> [[VEC16:%.*]])
; GFX9-NEXT: ret half [[TMP0]]
;
; VI-LABEL: @reduction_half16(
@@ -203,11 +185,7 @@ entry:
define i16 @reduction_v4i16(<4 x i16> %a) {
; GFX9-LABEL: @reduction_v4i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[A:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = add <4 x i16> [[A]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[BIN_RDX]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <4 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[BIN_RDX2]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_v4i16(
@@ -237,13 +215,7 @@ entry:
define i16 @reduction_v8i16(<8 x i16> %vec8) {
; GFX9-LABEL: @reduction_v8i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[VEC8]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <8 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX2]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[BIN_RDX4:%.*]] = add <8 x i16> [[BIN_RDX2]], [[RDX_SHUF3]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[BIN_RDX4]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VEC8:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_v8i16(
@@ -289,13 +261,7 @@ entry:
define i16 @reduction_umin_v4i16(<4 x i16> %vec4) {
; GFX9-LABEL: @reduction_umin_v4i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <4 x i16> [[VEC4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[VEC4:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_umin_v4i16(
@@ -331,16 +297,7 @@ entry:
define i16 @reduction_icmp_v8i16(<8 x i16> %vec8) {
; GFX9-LABEL: @reduction_icmp_v8i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i16> [[VEC8]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i16> [[VEC8]], <8 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> [[RDX_SHUF4]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[RDX_MINMAX_SELECT6]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[VEC8:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_icmp_v8i16(
@@ -402,19 +359,7 @@ entry:
define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
; GFX9-LABEL: @reduction_smin_v16i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i16> [[VEC16:%.*]], <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <16 x i16> [[VEC16]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i16> [[VEC16]], <16 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> poison, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> [[RDX_SHUF4]]
; GFX9-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> [[RDX_SHUF7]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x i16> [[RDX_MINMAX_SELECT9]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> [[VEC16:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_smin_v16i16(
@@ -530,13 +475,7 @@ entry:
define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
; GFX9-LABEL: @reduction_umax_v4i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i16> [[VEC4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ugt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[VEC4:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_umax_v4i16(
@@ -572,13 +511,7 @@ entry:
define i16 @reduction_smax_v4i16(<4 x i16> %vec4) {
; GFX9-LABEL: @reduction_smax_v4i16(
; GFX9-NEXT: entry:
; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i16> [[VEC4]], [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[VEC4:%.*]])
; GFX9-NEXT: ret i16 [[TMP0]]
;
; VI-LABEL: @reduction_smax_v4i16(