
[X86][SSE] Use (V)PHMINPOSUW for vXi8 SMAX/SMIN/UMAX/UMIN horizontal reductions (PR32841)

Extension to D39729, which performed this for vXi16: with the same bit flipping to handle the SMAX/SMIN/UMAX cases, vXi8 horizontal reductions can be performed via UMIN as well.
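
As an aside for readers (an editor's sketch, not part of the patch): the bit-flip trick can be checked in scalar C++. XORing each value with the signed-max pattern reverses the signed order into the unsigned order, so an unsigned minimum followed by a second XOR recovers the signed maximum; the helper name below is hypothetical.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  // Sketch only: smax(a, b) computed the way the combine does it, as an
  // unsigned min over bit-flipped values. The SMAX mask is the signed-max
  // value (0x7F per i8 element); SMIN uses 0x80 and UMAX uses 0xFF.
  static uint8_t smax_via_umin(int8_t A, int8_t B) {
    uint8_t FA = uint8_t(A) ^ 0x7F; // flip: signed order -> unsigned order
    uint8_t FB = uint8_t(B) ^ 0x7F;
    return std::min(FA, FB) ^ 0x7F; // unflip the unsigned minimum
  }

  int main() {
    for (int A = -128; A < 128; ++A)
      for (int B = -128; B < 128; ++B)
        assert(int8_t(smax_via_umin(int8_t(A), int8_t(B))) ==
               std::max(int8_t(A), int8_t(B)));
  }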

This makes use of the fact that, by performing a pair-wise i8 SHUFFLE/UMIN before the PHMINPOSUW, we both take the UMIN of each pair and zero-extend the upper bits, leaving the vector ready to be treated as v8i16.
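
A minimal scalar model of that step (again an editor's sketch with hypothetical names, not the DAG combine itself): each 16-bit lane ends up holding umin(byte 2k, byte 2k+1) zero-extended, so PHMINPOSUW's unsigned word minimum equals the byte minimum.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  // Sketch only: scalar model of the v16i8 UMIN path. The pairwise UMIN
  // leaves the upper byte of every 16-bit lane zero, so each word is a
  // zero-extended byte minimum and the unsigned word minimum that
  // PHMINPOSUW computes is also the overall byte minimum.
  static uint8_t umin_v16i8_model(const uint8_t V[16]) {
    uint16_t W[8];
    for (int I = 0; I != 8; ++I)
      W[I] = std::min(V[2 * I], V[2 * I + 1]); // zero-extended pair min
    uint16_t R = W[0];
    for (int I = 1; I != 8; ++I) // the value part of PHMINPOSUW
      R = std::min(R, W[I]);
    return uint8_t(R);
  }

  int main() {
    const uint8_t V[16] = {9, 3, 250, 7, 5, 6, 1, 8,
                           2, 4, 11, 12, 13, 14, 15, 16};
    assert(umin_v16i8_model(V) == 1);
  }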

Differential Revision: https://reviews.llvm.org/D41294

llvm-svn: 321070
Simon Pilgrim 2017-12-19 12:02:40 +00:00
parent 859334a80d
commit 6dd49d11df
5 changed files with 427 additions and 631 deletions

View File

@ -30482,7 +30482,8 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
// Attempt to replace a min/max v8i16 horizontal reduction with PHMINPOSUW.
// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
// PHMINPOSUW.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE41.
@ -30490,7 +30491,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
return SDValue();
EVT ExtractVT = Extract->getValueType(0);
if (ExtractVT != MVT::i16)
if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
return SDValue();
// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
@ -30502,7 +30503,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
return SDValue();
SDLoc DL(Extract);
@ -30518,22 +30519,39 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
assert(SrcVT == MVT::v8i16 && "Unexpected value type");
assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
(SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
"Unexpected value type");
// PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
// to flip the value accordingly.
SDValue Mask;
unsigned MaskEltsBits = ExtractVT.getSizeInBits();
if (BinOp == ISD::SMAX)
Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::SMIN)
Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
else if (BinOp == ISD::UMAX)
Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
// For v16i8 cases we need to perform UMIN on pairs of byte elements,
// shuffling each upper element down and inserting zeros. This means that the
// v16i8 UMIN will leave the upper element as zero, performing zero-extension
// ready for the PHMINPOS.
if (ExtractVT == MVT::i8) {
SDValue Upper = DAG.getVectorShuffle(
SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
{1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
}
// Perform the PHMINPOS on a v8i16 vector.
MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
MinPos = DAG.getBitcast(SrcVT, MinPos);
if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
@ -30849,7 +30867,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
// Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;

View File

@ -309,30 +309,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: psrlw $8, %xmm2
; X86-SSE42-NEXT: pminub %xmm0, %xmm2
; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@ -371,30 +366,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: psrlw $8, %xmm2
; X64-SSE42-NEXT: pminub %xmm0, %xmm2
; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@ -906,16 +896,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: psrlw $8, %xmm2
; X86-SSE42-NEXT: pminub %xmm0, %xmm2
; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@ -924,14 +911,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@ -940,15 +925,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@ -994,16 +977,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: psrlw $8, %xmm2
; X64-SSE42-NEXT: pminub %xmm0, %xmm2
; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@ -1012,14 +992,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@ -1028,15 +1006,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@ -1045,15 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@ -1743,16 +1717,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: psrlw $8, %xmm2
; X86-SSE42-NEXT: pminub %xmm0, %xmm2
; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@ -1764,14 +1735,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@ -1781,15 +1750,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@ -1847,16 +1814,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: psrlw $8, %xmm2
; X64-SSE42-NEXT: pminub %xmm0, %xmm2
; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@ -1868,14 +1832,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@ -1885,15 +1847,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@ -1902,17 +1862,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper

View File

@ -311,30 +311,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: psrlw $8, %xmm2
; X86-SSE42-NEXT: pminub %xmm0, %xmm2
; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@ -373,30 +368,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: psrlw $8, %xmm2
; X64-SSE42-NEXT: pminub %xmm0, %xmm2
; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@ -910,16 +900,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: psrlw $8, %xmm2
; X86-SSE42-NEXT: pminub %xmm0, %xmm2
; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@ -928,14 +915,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@ -944,15 +929,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@ -998,16 +981,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: psrlw $8, %xmm2
; X64-SSE42-NEXT: pminub %xmm0, %xmm2
; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@ -1016,14 +996,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@ -1032,15 +1010,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@ -1049,15 +1025,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@ -1745,16 +1719,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pminsb %xmm3, %xmm1
; X86-SSE42-NEXT: pminsb %xmm2, %xmm0
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: psrlw $8, %xmm2
; X86-SSE42-NEXT: pminub %xmm0, %xmm2
; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@ -1766,14 +1737,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@ -1783,15 +1752,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@ -1849,16 +1816,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pminsb %xmm3, %xmm1
; X64-SSE42-NEXT: pminsb %xmm2, %xmm0
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: psrlw $8, %xmm2
; X64-SSE42-NEXT: pminub %xmm0, %xmm2
; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@ -1870,14 +1834,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@ -1887,15 +1849,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@ -1904,17 +1864,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper

View File

@ -362,30 +362,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: psrlw $8, %xmm2
; X86-SSE42-NEXT: pminub %xmm0, %xmm2
; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@ -408,30 +403,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: psrlw $8, %xmm2
; X64-SSE42-NEXT: pminub %xmm0, %xmm2
; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@ -1031,16 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: psrlw $8, %xmm2
; X86-SSE42-NEXT: pminub %xmm0, %xmm2
; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@ -1049,14 +1036,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@ -1065,15 +1050,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@ -1099,16 +1082,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: psrlw $8, %xmm2
; X64-SSE42-NEXT: pminub %xmm0, %xmm2
; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@ -1117,14 +1097,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@ -1133,15 +1111,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@ -1150,15 +1126,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@ -1992,16 +1966,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1
; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
; X86-SSE42-NEXT: psrlw $8, %xmm2
; X86-SSE42-NEXT: pminub %xmm0, %xmm2
; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X86-SSE42-NEXT: pxor %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@ -2013,14 +1984,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@ -2030,15 +1999,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@ -2068,16 +2035,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1
; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
; X64-SSE42-NEXT: psrlw $8, %xmm2
; X64-SSE42-NEXT: pminub %xmm0, %xmm2
; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0
; X64-SSE42-NEXT: pxor %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@ -2089,14 +2053,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@ -2106,15 +2068,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -2123,17 +2083,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2
; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper

@@ -352,30 +352,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X86-SSE42-LABEL: test_reduce_v16i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: psrlw $8, %xmm1
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: test_reduce_v16i8:
; X86-AVX: ## %bb.0:
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX-NEXT: retl
@@ -398,30 +387,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
;
; X64-SSE42-LABEL: test_reduce_v16i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: psrlw $8, %xmm1
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: test_reduce_v16i8:
; X64-AVX: ## %bb.0:
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX-NEXT: retq
@@ -1004,16 +982,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-SSE42-LABEL: test_reduce_v32i8:
; X86-SSE42: ## %bb.0:
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: psrlw $8, %xmm1
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1022,14 +994,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1038,15 +1005,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X86-AVX2-LABEL: test_reduce_v32i8:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -1072,16 +1034,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-SSE42-LABEL: test_reduce_v32i8:
; X64-SSE42: ## %bb.0:
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: psrlw $8, %xmm1
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -1090,14 +1046,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX1: ## %bb.0:
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -1106,15 +1057,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX2-LABEL: test_reduce_v32i8:
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -1123,15 +1069,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v32i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper
@@ -1942,16 +1883,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-SSE42-NEXT: pminub %xmm3, %xmm1
; X86-SSE42-NEXT: pminub %xmm2, %xmm0
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
; X86-SSE42-NEXT: psrld $16, %xmm1
; X86-SSE42-NEXT: psrlw $8, %xmm1
; X86-SSE42-NEXT: pminub %xmm0, %xmm1
; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
; X86-SSE42-NEXT: psrlw $8, %xmm0
; X86-SSE42-NEXT: pminub %xmm1, %xmm0
; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X86-SSE42-NEXT: retl
@@ -1963,14 +1898,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX1-NEXT: vzeroupper
@@ -1980,15 +1910,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X86-AVX2-NEXT: vzeroupper
@@ -2018,16 +1943,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-SSE42-NEXT: pminub %xmm3, %xmm1
; X64-SSE42-NEXT: pminub %xmm2, %xmm0
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
; X64-SSE42-NEXT: psrld $16, %xmm1
; X64-SSE42-NEXT: psrlw $8, %xmm1
; X64-SSE42-NEXT: pminub %xmm0, %xmm1
; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
; X64-SSE42-NEXT: psrlw $8, %xmm0
; X64-SSE42-NEXT: pminub %xmm1, %xmm0
; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0
; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
; X64-SSE42-NEXT: retq
@@ -2039,14 +1958,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX1-NEXT: vzeroupper
@@ -2056,15 +1970,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX2-NEXT: vzeroupper
@@ -2073,17 +1982,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
; X64-AVX512-LABEL: test_reduce_v64i8:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
; X64-AVX512-NEXT: vzeroupper