
[ARM] MVE saturating truncates

This adds some custom lowering for VQMOVN, an instruction that can be
used to perform saturating truncates from a pair of min(max(X, -0x8000),
0x7fff), provided those constants are correct. This leaves a VQMOVNBs
which saturates the value and inserts it into the bottom lanes of an
existing vector. We then need to do something with the other lanes,
extending the value using a vmovlb.

Ideally, as will often be the case, only the bottom lane of what remains
will be demanded, allowing the vmovlb to be removed. This should mean
the instruction is equal or a win most of the time, and it allows some
extra follow-up folding to happen.

Differential Revision: https://reviews.llvm.org/D77590
David Green 2020-05-16 14:54:33 +01:00
parent 5658bcf0c4
commit 4120e7a927
7 changed files with 743 additions and 699 deletions


@@ -946,6 +946,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::BITCAST);
}
if (Subtarget->hasMVEIntegerOps()) {
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::UMIN);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMAX);
}
if (!Subtarget->hasFP64()) {
// When targeting a floating-point unit with only single-precision
@@ -1668,6 +1674,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VTBL1: return "ARMISD::VTBL1";
case ARMISD::VTBL2: return "ARMISD::VTBL2";
case ARMISD::VMOVN: return "ARMISD::VMOVN";
case ARMISD::VQMOVNs: return "ARMISD::VQMOVNs";
case ARMISD::VQMOVNu: return "ARMISD::VQMOVNu";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
case ARMISD::VADDVs: return "ARMISD::VADDVs";
@@ -14864,6 +14872,107 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
/// saturates.
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
const ARMSubtarget *ST) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
if (!ST->hasMVEIntegerOps())
return SDValue();
if (VT != MVT::v4i32 && VT != MVT::v8i16)
return SDValue();
auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
// Check one is a smin and the other is a smax
if (Min->getOpcode() != ISD::SMIN)
std::swap(Min, Max);
if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
return false;
APInt SaturateC;
if (VT == MVT::v4i32)
SaturateC = APInt(32, (1 << 15) - 1, true);
else //if (VT == MVT::v8i16)
SaturateC = APInt(16, (1 << 7) - 1, true);
APInt MinC, MaxC;
if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
MinC != SaturateC)
return false;
if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
MaxC != ~SaturateC)
return false;
return true;
};
if (IsSignedSaturate(N, N0.getNode())) {
SDLoc DL(N);
MVT ExtVT, HalfVT;
if (VT == MVT::v4i32) {
HalfVT = MVT::v8i16;
ExtVT = MVT::v4i16;
} else { // if (VT == MVT::v8i16)
HalfVT = MVT::v16i8;
ExtVT = MVT::v8i8;
}
// Create a VQMOVNB with undef top lanes, then sign extended into the top
// half. That extend will hopefully be removed if only the bottom bits are
// demanded (through a truncating store, for example).
SDValue VQMOVN =
DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
DAG.getValueType(ExtVT));
}
auto IsUnsignedSaturate = [&](SDNode *Min) {
// For unsigned, we just need to check for <= 0xffff
if (Min->getOpcode() != ISD::UMIN)
return false;
APInt SaturateC;
if (VT == MVT::v4i32)
SaturateC = APInt(32, (1 << 16) - 1, true);
else //if (VT == MVT::v8i16)
SaturateC = APInt(16, (1 << 8) - 1, true);
APInt MinC;
if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
MinC != SaturateC)
return false;
return true;
};
if (IsUnsignedSaturate(N)) {
SDLoc DL(N);
MVT HalfVT;
unsigned ExtConst;
if (VT == MVT::v4i32) {
HalfVT = MVT::v8i16;
ExtConst = 0x0000FFFF;
} else { //if (VT == MVT::v8i16)
HalfVT = MVT::v16i8;
ExtConst = 0x00FF;
}
// Create a VQMOVNB with undef top lanes, then ZExt into the top half with
// an AND. That extend will hopefully be removed if only the bottom bits are
// demanded (through a truncating store, for example).
SDValue VQMOVN =
DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
DAG.getConstant(0, DL, MVT::i32));
SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
return DAG.getNode(ISD::AND, DL, VT, Bitcast,
DAG.getConstant(ExtConst, DL, VT));
}
return SDValue();
}
static const APInt *isPowerOf2Constant(SDValue V) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
if (!C)
@@ -15419,7 +15528,13 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformShiftCombine(N, DCI, Subtarget);
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
case ISD::ANY_EXTEND:
return PerformExtendCombine(N, DCI.DAG, Subtarget);
case ISD::SMIN:
case ISD::UMIN:
case ISD::SMAX:
case ISD::UMAX:
return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
case ISD::LOAD: return PerformLOADCombine(N, DCI);
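The unsigned side of the combine needs only the single umin check in
IsUnsignedSaturate above. As a rough sketch (again a hypothetical
example, not code from this patch), source of this shape is what lowers
to VQMOVNu plus the masking AND:

#include <cstdint>

// Saturating u32 -> u16 truncate: a plain umin against 0xffff suffices.
void sat_trunc_u32_to_u16(const uint32_t *in, uint16_t *out, int n) {
  for (int i = 0; i < n; ++i) {
    uint32_t v = in[i];
    v = v > 0xffffu ? 0xffffu : v; // umin(v, 0xffff)
    out[i] = static_cast<uint16_t>(v); // truncating store
  }
}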


@@ -204,6 +204,10 @@ class VectorType;
VTBL2, // 2-register shuffle with mask
VMOVN, // MVE vmovn
// MVE Saturating truncates
VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s)
VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u)
// Vector multiply long:
VMULLs, // ...signed
VMULLu, // ...unsigned


@@ -4714,6 +4714,31 @@ defm : MVE_VQMOVN_p<MVE_VQMOVUNs32th, 1, 0, 1, MVE_v8i16, MVE_v4i32>;
defm : MVE_VQMOVN_p<MVE_VQMOVUNs16bh, 1, 0, 0, MVE_v16i8, MVE_v8i16>;
defm : MVE_VQMOVN_p<MVE_VQMOVUNs16th, 1, 0, 1, MVE_v16i8, MVE_v8i16>;
def SDTARMVMOVNQ : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
SDTCisVec<2>, SDTCisVT<3, i32>]>;
def MVEvqmovns : SDNode<"ARMISD::VQMOVNs", SDTARMVMOVNQ>;
def MVEvqmovnu : SDNode<"ARMISD::VQMOVNu", SDTARMVMOVNQ>;
let Predicates = [HasMVEInt] in {
def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))),
(v8i16 (MVE_VQMOVNs32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))),
(v8i16 (MVE_VQMOVNs32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
(v16i8 (MVE_VQMOVNs16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
(v16i8 (MVE_VQMOVNs16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))),
(v8i16 (MVE_VQMOVNu32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))),
(v8i16 (MVE_VQMOVNu32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
(v16i8 (MVE_VQMOVNu16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
(v16i8 (MVE_VQMOVNu16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
}
class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
dag iops_extra, vpred_ops vpred, string cstr>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),

File diff suppressed because it is too large.


@@ -4,10 +4,8 @@
define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_t1(<4 x i32> %s0, <8 x i16> %src1) {
; CHECK-LABEL: vqmovni32_sminmax_t1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i32 q2, #0x7fff
; CHECK-NEXT: vmax.s32 q0, q0, q2
; CHECK-NEXT: vmov.i32 q2, #0x7fff
; CHECK-NEXT: vmin.s32 q0, q0, q2
; CHECK-NEXT: vqmovnb.s32 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmovnt.i32 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
@@ -24,10 +22,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_t2(<4 x i32> %s0, <8 x i16> %src1) {
; CHECK-LABEL: vqmovni32_sminmax_t2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i32 q2, #0x7fff
; CHECK-NEXT: vmax.s32 q0, q0, q2
; CHECK-NEXT: vmov.i32 q2, #0x7fff
; CHECK-NEXT: vmin.s32 q0, q0, q2
; CHECK-NEXT: vqmovnb.s32 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmovnt.i32 q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -43,10 +39,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_b1(<4 x i32> %s0, <8 x i16> %src1) {
; CHECK-LABEL: vqmovni32_sminmax_b1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i32 q2, #0x7fff
; CHECK-NEXT: vmax.s32 q0, q0, q2
; CHECK-NEXT: vmov.i32 q2, #0x7fff
; CHECK-NEXT: vmin.s32 q0, q0, q2
; CHECK-NEXT: vqmovnb.s32 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmovnb.i32 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
@@ -63,10 +57,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_b2(<4 x i32> %s0, <8 x i16> %src1) {
; CHECK-LABEL: vqmovni32_sminmax_b2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i32 q2, #0x7fff
; CHECK-NEXT: vmax.s32 q0, q0, q2
; CHECK-NEXT: vmov.i32 q2, #0x7fff
; CHECK-NEXT: vmin.s32 q0, q0, q2
; CHECK-NEXT: vqmovnb.s32 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmovnb.i32 q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -83,8 +75,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni32_uminmax_t1(<4 x i32> %s0, <8 x i16> %src1) {
; CHECK-LABEL: vqmovni32_uminmax_t1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xffff
; CHECK-NEXT: vmin.u32 q0, q0, q2
; CHECK-NEXT: vqmovnb.u32 q0, q0
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmovnt.i32 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
@@ -99,8 +91,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni32_uminmax_t2(<4 x i32> %s0, <8 x i16> %src1) {
; CHECK-LABEL: vqmovni32_uminmax_t2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xffff
; CHECK-NEXT: vmin.u32 q0, q0, q2
; CHECK-NEXT: vqmovnb.u32 q0, q0
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmovnt.i32 q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -114,8 +106,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni32_uminmax_b1(<4 x i32> %s0, <8 x i16> %src1) {
; CHECK-LABEL: vqmovni32_uminmax_b1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xffff
; CHECK-NEXT: vmin.u32 q0, q0, q2
; CHECK-NEXT: vqmovnb.u32 q0, q0
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmovnb.i32 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
@@ -145,10 +137,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_t1(<8 x i16> %s0, <16 x i8> %src1) {
; CHECK-LABEL: vqmovni16_sminmax_t1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i16 q2, #0x7f
; CHECK-NEXT: vmax.s16 q0, q0, q2
; CHECK-NEXT: vmov.i16 q2, #0x7f
; CHECK-NEXT: vmin.s16 q0, q0, q2
; CHECK-NEXT: vqmovnb.s16 q0, q0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovnt.i16 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
@@ -165,10 +155,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_t2(<8 x i16> %s0, <16 x i8> %src1) {
; CHECK-LABEL: vqmovni16_sminmax_t2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i16 q2, #0x7f
; CHECK-NEXT: vmax.s16 q0, q0, q2
; CHECK-NEXT: vmov.i16 q2, #0x7f
; CHECK-NEXT: vmin.s16 q0, q0, q2
; CHECK-NEXT: vqmovnb.s16 q0, q0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovnt.i16 q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -184,10 +172,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_b1(<8 x i16> %s0, <16 x i8> %src1) {
; CHECK-LABEL: vqmovni16_sminmax_b1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i16 q2, #0x7f
; CHECK-NEXT: vmax.s16 q0, q0, q2
; CHECK-NEXT: vmov.i16 q2, #0x7f
; CHECK-NEXT: vmin.s16 q0, q0, q2
; CHECK-NEXT: vqmovnb.s16 q0, q0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovnb.i16 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
@@ -204,10 +190,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_b2(<8 x i16> %s0, <16 x i8> %src1) {
; CHECK-LABEL: vqmovni16_sminmax_b2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i16 q2, #0x7f
; CHECK-NEXT: vmax.s16 q0, q0, q2
; CHECK-NEXT: vmov.i16 q2, #0x7f
; CHECK-NEXT: vmin.s16 q0, q0, q2
; CHECK-NEXT: vqmovnb.s16 q0, q0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovnb.i16 q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -224,8 +208,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @vqmovni16_uminmax_t1(<8 x i16> %s0, <16 x i8> %src1) {
; CHECK-LABEL: vqmovni16_uminmax_t1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i16 q2, #0xff
; CHECK-NEXT: vmin.u16 q0, q0, q2
; CHECK-NEXT: vqmovnb.u16 q0, q0
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmovnt.i16 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
@@ -240,8 +224,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @vqmovni16_uminmax_t2(<8 x i16> %s0, <16 x i8> %src1) {
; CHECK-LABEL: vqmovni16_uminmax_t2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i16 q2, #0xff
; CHECK-NEXT: vmin.u16 q0, q0, q2
; CHECK-NEXT: vqmovnb.u16 q0, q0
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmovnt.i16 q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -255,8 +239,8 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @vqmovni16_uminmax_b1(<8 x i16> %s0, <16 x i8> %src1) {
; CHECK-LABEL: vqmovni16_uminmax_b1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i16 q2, #0xff
; CHECK-NEXT: vmin.u16 q0, q0, q2
; CHECK-NEXT: vqmovnb.u16 q0, q0
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmovnb.i16 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr


@@ -4,10 +4,8 @@
define arm_aapcs_vfpcc <4 x i32> @vqmovni32_smaxmin(<4 x i32> %s0) {
; CHECK-LABEL: vqmovni32_smaxmin:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q1, #0x7fff
; CHECK-NEXT: vmin.s32 q0, q0, q1
; CHECK-NEXT: vmvn.i32 q1, #0x7fff
; CHECK-NEXT: vmax.s32 q0, q0, q1
; CHECK-NEXT: vqmovnb.s32 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp slt <4 x i32> %s0, <i32 32767, i32 32767, i32 32767, i32 32767>
@@ -20,10 +18,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vqmovni32_sminmax(<4 x i32> %s0) {
; CHECK-LABEL: vqmovni32_sminmax:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i32 q1, #0x7fff
; CHECK-NEXT: vmax.s32 q0, q0, q1
; CHECK-NEXT: vmov.i32 q1, #0x7fff
; CHECK-NEXT: vmin.s32 q0, q0, q1
; CHECK-NEXT: vqmovnb.s32 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp sgt <4 x i32> %s0, <i32 -32768, i32 -32768, i32 -32768, i32 -32768>
@@ -36,8 +32,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vqmovni32_umaxmin(<4 x i32> %s0) {
; CHECK-LABEL: vqmovni32_umaxmin:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q1, #0xffff
; CHECK-NEXT: vmin.u32 q0, q0, q1
; CHECK-NEXT: vqmovnb.u32 q0, q0
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp ult <4 x i32> %s0, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -48,8 +44,8 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @vqmovni32_uminmax(<4 x i32> %s0) {
; CHECK-LABEL: vqmovni32_uminmax:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q1, #0xffff
; CHECK-NEXT: vmin.u32 q0, q0, q1
; CHECK-NEXT: vqmovnb.u32 q0, q0
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: bx lr
entry:
%c2 = icmp ult <4 x i32> %s0, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -60,10 +56,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni16_smaxmin(<8 x i16> %s0) {
; CHECK-LABEL: vqmovni16_smaxmin:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i16 q1, #0x7f
; CHECK-NEXT: vmin.s16 q0, q0, q1
; CHECK-NEXT: vmvn.i16 q1, #0x7f
; CHECK-NEXT: vmax.s16 q0, q0, q1
; CHECK-NEXT: vqmovnb.s16 q0, q0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp slt <8 x i16> %s0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
@@ -76,10 +70,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni16_sminmax(<8 x i16> %s0) {
; CHECK-LABEL: vqmovni16_sminmax:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmvn.i16 q1, #0x7f
; CHECK-NEXT: vmax.s16 q0, q0, q1
; CHECK-NEXT: vmov.i16 q1, #0x7f
; CHECK-NEXT: vmin.s16 q0, q0, q1
; CHECK-NEXT: vqmovnb.s16 q0, q0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp sgt <8 x i16> %s0, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
@@ -92,8 +84,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni16_umaxmin(<8 x i16> %s0) {
; CHECK-LABEL: vqmovni16_umaxmin:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i16 q1, #0xff
; CHECK-NEXT: vmin.u16 q0, q0, q1
; CHECK-NEXT: vqmovnb.u16 q0, q0
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp ult <8 x i16> %s0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -104,8 +96,8 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @vqmovni16_uminmax(<8 x i16> %s0) {
; CHECK-LABEL: vqmovni16_uminmax:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i16 q1, #0xff
; CHECK-NEXT: vmin.u16 q0, q0, q1
; CHECK-NEXT: vqmovnb.u16 q0, q0
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: bx lr
entry:
%c2 = icmp ult <8 x i16> %s0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>


@@ -5,10 +5,8 @@ define arm_aapcs_vfpcc <4 x i32> @vqshrni32_smaxmin(<4 x i32> %so) {
; CHECK-LABEL: vqshrni32_smaxmin:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.s32 q0, q0, #3
; CHECK-NEXT: vmov.i32 q1, #0x7fff
; CHECK-NEXT: vmin.s32 q0, q0, q1
; CHECK-NEXT: vmvn.i32 q1, #0x7fff
; CHECK-NEXT: vmax.s32 q0, q0, q1
; CHECK-NEXT: vqmovnb.s32 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: bx lr
entry:
%s0 = ashr <4 x i32> %so, <i32 3, i32 3, i32 3, i32 3>
@@ -23,10 +21,8 @@ define arm_aapcs_vfpcc <4 x i32> @vqshrni32_sminmax(<4 x i32> %so) {
; CHECK-LABEL: vqshrni32_sminmax:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.s32 q0, q0, #3
; CHECK-NEXT: vmvn.i32 q1, #0x7fff
; CHECK-NEXT: vmax.s32 q0, q0, q1
; CHECK-NEXT: vmov.i32 q1, #0x7fff
; CHECK-NEXT: vmin.s32 q0, q0, q1
; CHECK-NEXT: vqmovnb.s32 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: bx lr
entry:
%s0 = ashr <4 x i32> %so, <i32 3, i32 3, i32 3, i32 3>
@@ -41,8 +37,8 @@ define arm_aapcs_vfpcc <4 x i32> @vqshrni32_umaxmin(<4 x i32> %so) {
; CHECK-LABEL: vqshrni32_umaxmin:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.u32 q0, q0, #3
; CHECK-NEXT: vmov.i32 q1, #0xffff
; CHECK-NEXT: vmin.u32 q0, q0, q1
; CHECK-NEXT: vqmovnb.u32 q0, q0
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: bx lr
entry:
%s0 = lshr <4 x i32> %so, <i32 3, i32 3, i32 3, i32 3>
@@ -55,8 +51,8 @@ define arm_aapcs_vfpcc <4 x i32> @vqshrni32_uminmax(<4 x i32> %so) {
; CHECK-LABEL: vqshrni32_uminmax:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.u32 q0, q0, #3
; CHECK-NEXT: vmov.i32 q1, #0xffff
; CHECK-NEXT: vmin.u32 q0, q0, q1
; CHECK-NEXT: vqmovnb.u32 q0, q0
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: bx lr
entry:
%s0 = lshr <4 x i32> %so, <i32 3, i32 3, i32 3, i32 3>
@@ -69,10 +65,8 @@ define arm_aapcs_vfpcc <8 x i16> @vqshrni16_smaxmin(<8 x i16> %so) {
; CHECK-LABEL: vqshrni16_smaxmin:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.s16 q0, q0, #3
; CHECK-NEXT: vmov.i16 q1, #0x7f
; CHECK-NEXT: vmin.s16 q0, q0, q1
; CHECK-NEXT: vmvn.i16 q1, #0x7f
; CHECK-NEXT: vmax.s16 q0, q0, q1
; CHECK-NEXT: vqmovnb.s16 q0, q0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: bx lr
entry:
%s0 = ashr <8 x i16> %so, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -87,10 +81,8 @@ define arm_aapcs_vfpcc <8 x i16> @vqshrni16_sminmax(<8 x i16> %so) {
; CHECK-LABEL: vqshrni16_sminmax:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.s16 q0, q0, #3
; CHECK-NEXT: vmvn.i16 q1, #0x7f
; CHECK-NEXT: vmax.s16 q0, q0, q1
; CHECK-NEXT: vmov.i16 q1, #0x7f
; CHECK-NEXT: vmin.s16 q0, q0, q1
; CHECK-NEXT: vqmovnb.s16 q0, q0
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: bx lr
entry:
%s0 = ashr <8 x i16> %so, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -105,8 +97,8 @@ define arm_aapcs_vfpcc <8 x i16> @vqshrni16_umaxmin(<8 x i16> %so) {
; CHECK-LABEL: vqshrni16_umaxmin:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.u16 q0, q0, #3
; CHECK-NEXT: vmov.i16 q1, #0xff
; CHECK-NEXT: vmin.u16 q0, q0, q1
; CHECK-NEXT: vqmovnb.u16 q0, q0
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: bx lr
entry:
%s0 = lshr <8 x i16> %so, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -119,8 +111,8 @@ define arm_aapcs_vfpcc <8 x i16> @vqshrni16_uminmax(<8 x i16> %so) {
; CHECK-LABEL: vqshrni16_uminmax:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.u16 q0, q0, #3
; CHECK-NEXT: vmov.i16 q1, #0xff
; CHECK-NEXT: vmin.u16 q0, q0, q1
; CHECK-NEXT: vqmovnb.u16 q0, q0
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: bx lr
entry:
%s0 = lshr <8 x i16> %so, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>