From ba623f58f9738005a7075db66f06bd363329c011 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 18 Dec 2018 18:26:25 +0000 Subject: [PATCH] [X86] Create PSUBUS from (add (umax X, C), -C) InstCombine seems to canonicalize our PSUBUS pattern into a max with the constant and an add with an inverse of the constant. This patch recognizes this pattern and turns it into PSUBUS. Future work could improve undef element handling. Fixes some of PR40053 Differential Revision: https://reviews.llvm.org/D55780 llvm-svn: 349519 --- lib/Target/X86/X86ISelLowering.cpp | 44 +++ test/CodeGen/X86/psubus.ll | 412 +++++++---------------------- 2 files changed, 146 insertions(+), 310 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index cc044e1dbf6..7d131661649 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -34108,6 +34108,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C + // TODO: Handle build_vectors with undef elements. auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1); }; @@ -40611,6 +40612,46 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, PMADDBuilder); } +// Try to turn (add (umax X, C), -C) into (psubus X, C) +static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasSSE2()) + return SDValue(); + + EVT VT = N->getValueType(0); + + // psubus is available in SSE2 for i8 and i16 vectors. 
+ if (!VT.isVector() || VT.getVectorNumElements() < 2 || + !isPowerOf2_32(VT.getVectorNumElements()) || + !(VT.getVectorElementType() == MVT::i8 || + VT.getVectorElementType() == MVT::i16)) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + if (Op0.getOpcode() != ISD::UMAX) + return SDValue(); + + // The add should have a constant that is the negative of the max. + // TODO: Handle build_vectors with undef elements. + auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) { + return Max->getAPIntValue() == (-Op->getAPIntValue()); + }; + if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT)) + return SDValue(); + + auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef<SDValue> Ops) { + return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops); + }; + + // Take both operands from the umax node. + SDLoc DL(N); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, + { Op0.getOperand(0), Op0.getOperand(1) }, + USUBSATBuilder); +} + // Attempt to turn this pattern into PMADDWD. 
// (mul (add (zext (build_vector)), (zext (build_vector))), // (add (zext (build_vector)), (zext (build_vector))) @@ -40766,6 +40807,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineIncDecVector(N, DAG)) return V; + if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll index 9320eed534d..2fc70291b0a 100644 --- a/test/CodeGen/X86/psubus.ll +++ b/test/CodeGen/X86/psubus.ll @@ -2414,14 +2414,12 @@ define void @subus_v2i16(<2 x i16>* %p1, <2 x i16>* %p2) { define <16 x i8> @test19(<16 x i8> %x) { ; SSE-LABEL: test19: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pmaxub {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test19: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %0 = icmp ugt <16 x i8> %x, @@ -2433,14 +2431,12 @@ entry: define <16 x i8> @test20(<16 x i8> %x) { ; SSE-LABEL: test20: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pmaxub {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test20: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %0 = icmp ugt <16 x i8> %x, @@ -2450,34 +2446,14 @@ entry: } define <8 x i16> @test21(<8 x i16> %x) { -; SSE2-LABEL: test21: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test21: -; SSSE3: # 
%bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: test21: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmaxuw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test21: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test21: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmaxuw {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %0 = icmp ugt <8 x i16> %x, @@ -2487,34 +2463,14 @@ entry: } define <8 x i16> @test22(<8 x i16> %x) { -; SSE2-LABEL: test22: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test22: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: test22: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmaxuw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test22: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test22: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmaxuw {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: %0 = icmp ugt <8 x i16> %x, @@ -2527,35 +2483,27 
@@ define <32 x i8> @test23(<32 x i8> %x) { ; SSE-LABEL: test23: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] -; SSE-NEXT: pmaxub %xmm2, %xmm1 -; SSE-NEXT: pmaxub %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186] -; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: paddb %xmm2, %xmm1 +; SSE-NEXT: psubusb %xmm2, %xmm0 +; SSE-NEXT: psubusb %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: test23: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] -; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [186,186,186,186,186,186,186,186,186,186,186,186,186,186,186,186] -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [70,70,70,70,70,70,70,70,70,70,70,70,70,70,70,70] +; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test23: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test23: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmaxub {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <32 x i8> %x, @@ -2567,32 +2515,26 @@ entry: define <32 x i8> @test24(<32 x i8> %x) { ; SSE-LABEL: test24: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pmaxub {{.*}}(%rip), %xmm1 -; SSE-NEXT: pmaxub {{.*}}(%rip), %xmm0 -; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 -; 
SSE-NEXT: paddb {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubusb {{.*}}(%rip), %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: test24: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmaxub {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test24: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test24: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmaxub {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <32 x i8> %x, @@ -2602,68 +2544,30 @@ entry: } define <16 x i16> @test25(<16 x i16> %x) { -; SSE2-LABEL: test25: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37768,37768,37768,37768,37768,37768,37768,37768] -; SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pmaxsw %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [60536,60536,60536,60536,60536,60536,60536,60536] -; SSE2-NEXT: paddw %xmm2, %xmm0 -; SSE2-NEXT: paddw %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test25: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm2, 
%xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [37768,37768,37768,37768,37768,37768,37768,37768] -; SSSE3-NEXT: pmaxsw %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pmaxsw %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [60536,60536,60536,60536,60536,60536,60536,60536] -; SSSE3-NEXT: paddw %xmm2, %xmm0 -; SSSE3-NEXT: paddw %xmm2, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: test25: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000] -; SSE41-NEXT: pmaxuw %xmm2, %xmm1 -; SSE41-NEXT: pmaxuw %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [60536,60536,60536,60536,60536,60536,60536,60536] -; SSE41-NEXT: paddw %xmm2, %xmm0 -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: test25: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000] +; SSE-NEXT: psubusw %xmm2, %xmm0 +; SSE-NEXT: psubusw %xmm2, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: test25: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5000,5000,5000,5000,5000,5000,5000,5000] -; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [60536,60536,60536,60536,60536,60536,60536,60536] -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [5000,5000,5000,5000,5000,5000,5000,5000] +; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test25: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmaxuw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: 
retq ; ; AVX512-LABEL: test25: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmaxuw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <16 x i16> %x, @@ -2673,60 +2577,28 @@ entry: } define <16 x i16> @test26(<16 x i16> %x) { -; SSE2-LABEL: test26: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test26: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pmaxsw {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: test26: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmaxuw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmaxuw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: test26: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubusw {{.*}}(%rip), %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: test26: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmaxuw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpmaxuw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: 
vpsubusw {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test26: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmaxuw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test26: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmaxuw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <16 x i16> %x, @@ -2739,49 +2611,35 @@ define <64 x i8> @test27(<64 x i8> %x) { ; SSE-LABEL: test27: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] -; SSE-NEXT: pmaxub %xmm4, %xmm3 -; SSE-NEXT: pmaxub %xmm4, %xmm2 -; SSE-NEXT: pmaxub %xmm4, %xmm1 -; SSE-NEXT: pmaxub %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102] -; SSE-NEXT: paddb %xmm4, %xmm0 -; SSE-NEXT: paddb %xmm4, %xmm1 -; SSE-NEXT: paddb %xmm4, %xmm2 -; SSE-NEXT: paddb %xmm4, %xmm3 +; SSE-NEXT: psubusb %xmm4, %xmm0 +; SSE-NEXT: psubusb %xmm4, %xmm1 +; SSE-NEXT: psubusb %xmm4, %xmm2 +; SSE-NEXT: psubusb %xmm4, %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: test27: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] -; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102] -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm4 
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] +; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubusb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsubusb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test27: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] -; AVX2-NEXT: vpmaxub %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102,102] -; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test27: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmaxub {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT: vpaddb {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpsubusb {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <64 x i8> %x, @@ -2794,47 +2652,34 @@ define <64 x i8> @test28(<64 x i8> %x) { ; SSE-LABEL: test28: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70] -; SSE-NEXT: pmaxub %xmm4, %xmm2 -; SSE-NEXT: pmaxub %xmm4, %xmm0 -; SSE-NEXT: pmaxub {{.*}}(%rip), %xmm3 -; SSE-NEXT: pmaxub {{.*}}(%rip), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = 
[255,22,50,114,77,70,133,158,193,237,22,156,231,222,201,186] -; SSE-NEXT: paddb %xmm4, %xmm0 -; SSE-NEXT: paddb %xmm4, %xmm2 -; SSE-NEXT: paddb {{.*}}(%rip), %xmm1 -; SSE-NEXT: paddb {{.*}}(%rip), %xmm3 +; SSE-NEXT: psubusb %xmm4, %xmm0 +; SSE-NEXT: psubusb %xmm4, %xmm2 +; SSE-NEXT: psubusb {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubusb {{.*}}(%rip), %xmm3 ; SSE-NEXT: retq ; ; AVX1-LABEL: test28: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,234,206,142,179,186,123,98,63,19,234,100,25,34,55,70] -; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmaxub {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,22,50,114,77,70,133,158,193,237,22,156,231,222,201,186] -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm2 -; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vpsubusb %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubusb {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test28: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpmaxub {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test28: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmaxub {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT: vpaddb {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpsubusb 
{{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <64 x i8> %x, @@ -2844,88 +2689,35 @@ entry: } define <32 x i16> @test29(<32 x i16> %x) { -; SSE2-LABEL: test29: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm2 -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm3 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test29: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pmaxsw {{.*}}(%rip), %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pmaxsw {{.*}}(%rip), %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pmaxsw {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pmaxsw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm1 -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm2 -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm3 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: test29: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmaxuw {{.*}}(%rip), %xmm3 -; SSE41-NEXT: pmaxuw {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmaxuw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmaxuw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm2 
-; SSE41-NEXT: paddw {{.*}}(%rip), %xmm3 -; SSE41-NEXT: retq +; SSE-LABEL: test29: +; SSE: # %bb.0: # %entry +; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0 +; SSE-NEXT: psubusw {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubusw {{.*}}(%rip), %xmm2 +; SSE-NEXT: psubusw {{.*}}(%rip), %xmm3 +; SSE-NEXT: retq ; ; AVX1-LABEL: test29: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpmaxuw {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpmaxuw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxuw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpmaxuw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsubusw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test29: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmaxuw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpmaxuw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test29: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmaxuw {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT: vpaddw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpsubusw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512-NEXT: retq entry: %0 = icmp ugt <32 x i16> %x,