
[x86] transform vector inc/dec to use -1 constant (PR33483)

Convert vector increment or decrement to sub/add with an all-ones constant:

add X, <1, 1...> --> sub X, <-1, -1...>
sub X, <1, 1...> --> add X, <-1, -1...>

The all-ones vector constant can be materialized using a pcmpeq instruction that is 
commonly recognized as an idiom (has no register dependency), so that's better than 
loading a splat 1 constant.

AVX512 uses 'vpternlogd' for 512-bit vectors because there is apparently no better
way to produce 512 one-bits.
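For readers less familiar with the idiom, here is a minimal intrinsics-level sketch (not part of this commit; SSE2-only, and the function name is just for illustration) of what the new lowering amounts to for a 128-bit <4 x i32> increment; the decrement case is symmetric (x - 1 becomes x + (-1)):

#include <emmintrin.h> // SSE2 intrinsics

// Sketch: increment each 32-bit lane of x by 1 without loading a constant.
__m128i inc_v4i32(__m128i x) {
  // pcmpeqd xmm1, xmm1: every lane compares equal to itself, so the result
  // is all-ones (-1) regardless of the register's contents; no constant-pool
  // load is needed, and many CPUs recognize this as a cheap idiom.
  __m128i all_ones = _mm_cmpeq_epi32(x, x);
  // x - (-1) == x + 1, so a psubd of the all-ones vector replaces the old
  // paddd with a memory-resident <1,1,1,1> constant.
  return _mm_sub_epi32(x, all_ones);
}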

The general advantages of this lowering are:
1. pcmpeq has lower latency than a memop on every uarch I looked at in Agner's tables, 
   so in theory, this could be better for perf, but...

2. That seems unlikely to affect any OOO implementation, and I can't measure any real 
   perf difference from this transform on Haswell or Jaguar, but...

3. It doesn't look like it from the diffs, but this is an overall size win because we 
   eliminate 16 - 64 constant bytes in the case of a vector load. If we're broadcasting 
   a scalar load (which might itself be a bug), then we're replacing a scalar constant 
   load + broadcast with a single cheap op, so that should always be smaller/better too.

4. This makes the DAG/isel output more consistent - we use pcmpeq already for padd x, -1 
   and psub x, -1, so we should use that form for +1 too because we can. If there's some
   reason to favor a constant load on some CPU, let's make the reverse transform for all
   of these cases (either here in the DAG or in a later machine pass).

This should fix:
https://bugs.llvm.org/show_bug.cgi?id=33483

Differential Revision: https://reviews.llvm.org/D34336

llvm-svn: 306289
Sanjay Patel 2017-06-26 14:19:26 +00:00
parent f30628d6b1
commit 43a91b22c0
21 changed files with 1756 additions and 1615 deletions


@ -35065,6 +35065,32 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}

/// Convert vector increment or decrement to sub/add with an all-ones constant:
/// add X, <1, 1...> --> sub X, <-1, -1...>
/// sub X, <1, 1...> --> add X, <-1, -1...>
/// The all-ones vector constant can be materialized using a pcmpeq instruction
/// that is commonly recognized as an idiom (has no register dependency), so
/// that's better/smaller than loading a splat 1 constant.
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
         "Unexpected opcode for increment/decrement transform");

  // Pseudo-legality check: getOnesVector() expects one of these types, so bail
  // out and wait for legalization if we have an unsupported vector length.
  EVT VT = N->getValueType(0);
  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
    return SDValue();

  SDNode *N1 = N->getOperand(1).getNode();
  APInt SplatVal;
  if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
    return SDValue();

  SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
  unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
  return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}

static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  const SDNodeFlags Flags = N->getFlags();
@ -35084,6 +35110,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}
@ -35117,6 +35146,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
      isHorizontalBinOp(Op0, Op1, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

File diff suppressed because it is too large.


@ -388,7 +388,8 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@ -434,9 +435,9 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vmovups %ymm0, (%eax)
; CHECK-NEXT: vzeroupper


@ -930,8 +930,8 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
; AVX-LABEL: movnt_dq:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX-NEXT: vpaddq LCPI65_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
; AVX-NEXT: ## fixup A - offset: 4, value: LCPI65_0, kind: FK_Data_4
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfb,0xc1]
; AVX-NEXT: vmovntdq %ymm0, (%eax) ## encoding: [0xc5,0xfd,0xe7,0x00]
; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX-NEXT: retl ## encoding: [0xc3]
@ -939,8 +939,8 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
; AVX512VL-LABEL: movnt_dq:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX512VL-NEXT: vpaddq LCPI65_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI65_0, kind: FK_Data_4
; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
; AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1]
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00]
; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; AVX512VL-NEXT: retl ## encoding: [0xc3]


@ -247,7 +247,8 @@ entry:
define <2 x i64> @vpandn(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandn:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; CHECK-NEXT: vpandn %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
entry:
@ -261,7 +262,8 @@ entry:
define <2 x i64> @vpand(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpand:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
entry:


@ -97,14 +97,16 @@ define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind
; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
@ -127,14 +129,15 @@ entry:
define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX2-NEXT: retq
entry:
@ -148,15 +151,16 @@ define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uw
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: retq
entry:
@ -169,13 +173,15 @@ entry:
define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
entry:
@ -189,14 +195,16 @@ define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounw
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
; AVX2-NEXT: retq
entry:


@ -382,7 +382,8 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_x86_avx_storeu_dq_256:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vpaddb LCPI34_0, %ymm0, %ymm0
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vmovdqu %ymm0, (%eax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl


@ -5,14 +5,15 @@
define <4 x i64> @vpandn(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; X32-LABEL: vpandn:
; X32: ## BB#0: ## %entry
; X32-NEXT: vpaddq LCPI0_0, %ymm0, %ymm1
; X32-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-NEXT: vpsubq %ymm1, %ymm0, %ymm1
; X32-NEXT: vpandn %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpandn:
; X64: ## BB#0: ## %entry
; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; X64-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-NEXT: vpsubq %ymm1, %ymm0, %ymm1
; X64-NEXT: vpandn %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
entry:
@ -26,14 +27,15 @@ entry:
define <4 x i64> @vpand(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; X32-LABEL: vpand:
; X32: ## BB#0: ## %entry
; X32-NEXT: vpaddq LCPI1_0, %ymm0, %ymm0
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpand:
; X64: ## BB#0: ## %entry
; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@ -46,14 +48,15 @@ entry:
define <4 x i64> @vpor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; X32-LABEL: vpor:
; X32: ## BB#0: ## %entry
; X32-NEXT: vpaddq LCPI2_0, %ymm0, %ymm0
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X32-NEXT: vpor %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpor:
; X64: ## BB#0: ## %entry
; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@ -66,14 +69,15 @@ entry:
define <4 x i64> @vpxor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; X32-LABEL: vpxor:
; X32: ## BB#0: ## %entry
; X32-NEXT: vpaddq LCPI3_0, %ymm0, %ymm0
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X32-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpxor:
; X64: ## BB#0: ## %entry
; X64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
entry:


@ -321,8 +321,9 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; GENERIC-NEXT: LBB7_6:
; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1
; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0
; GENERIC-NEXT: pcmpeqd %xmm2, %xmm2
; GENERIC-NEXT: paddd %xmm2, %xmm1
; GENERIC-NEXT: paddd %xmm2, %xmm0
; GENERIC-NEXT: movq %xmm0, 16(%rsi)
; GENERIC-NEXT: movdqa %xmm1, (%rsi)
; GENERIC-NEXT: retq
@ -361,8 +362,9 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; ATOM-NEXT: LBB7_6:
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0
; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1
; ATOM-NEXT: pcmpeqd %xmm2, %xmm2
; ATOM-NEXT: paddd %xmm2, %xmm0
; ATOM-NEXT: paddd %xmm2, %xmm1
; ATOM-NEXT: movq %xmm0, 16(%rsi)
; ATOM-NEXT: movdqa %xmm1, (%rsi)
; ATOM-NEXT: retq


@ -83,7 +83,8 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK: ## BB#0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: paddb LCPI7_0, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: psubb %xmm1, %xmm0
; CHECK-NEXT: movdqu %xmm0, (%eax)
; CHECK-NEXT: retl
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>


@ -12,20 +12,21 @@ define <2 x i64> @footz(<2 x i64> %a) nounwind {
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: psubq %xmm0, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: psubq {{.*}}(%rip), %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
; CHECK-NEXT: psrlq $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psubq %xmm0, %xmm2
; CHECK-NEXT: psubq %xmm0, %xmm3
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
; CHECK-NEXT: movdqa %xmm2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
; CHECK-NEXT: psrlq $2, %xmm2
; CHECK-NEXT: movdqa %xmm3, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: paddq %xmm3, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlq $2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
; CHECK-NEXT: psrlq $4, %xmm0
; CHECK-NEXT: paddq %xmm2, %xmm0
; CHECK-NEXT: paddq %xmm3, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: retq
@ -115,20 +116,21 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind {
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: psubq %xmm0, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: psubq {{.*}}(%rip), %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
; CHECK-NEXT: psrlq $1, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psubq %xmm0, %xmm2
; CHECK-NEXT: psubq %xmm0, %xmm3
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
; CHECK-NEXT: movdqa %xmm2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
; CHECK-NEXT: psrlq $2, %xmm2
; CHECK-NEXT: movdqa %xmm3, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: paddq %xmm3, %xmm2
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: psrlq $2, %xmm3
; CHECK-NEXT: pand %xmm0, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: movdqa %xmm3, %xmm0
; CHECK-NEXT: psrlq $4, %xmm0
; CHECK-NEXT: paddq %xmm2, %xmm0
; CHECK-NEXT: paddq %xmm3, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: psadbw %xmm1, %xmm0
; CHECK-NEXT: retq

File diff suppressed because it is too large.


@ -15,8 +15,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -28,7 +28,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
@ -44,8 +44,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -62,7 +62,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CDVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -79,8 +80,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX512CD-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -97,8 +98,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VPOPCNTDQ-NEXT: retq
@ -130,8 +131,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -143,7 +144,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
@ -159,8 +160,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -197,8 +198,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VPOPCNTDQ-NEXT: retq
@ -230,8 +231,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -247,7 +248,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
@ -267,8 +268,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -289,7 +290,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CDVL-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -310,8 +312,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX512CD-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -332,8 +334,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VPOPCNTDQ-NEXT: retq
@ -343,8 +345,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
; X32-AVX-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -370,8 +372,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -387,7 +389,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
@ -407,8 +409,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -449,8 +451,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512VPOPCNTDQ-NEXT: retq
@ -460,8 +462,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2
; X32-AVX-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -486,8 +488,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -502,7 +504,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
@ -520,7 +522,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -539,7 +542,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -558,7 +562,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -577,7 +582,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -596,7 +602,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubw {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -619,8 +626,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -635,7 +642,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
@ -653,7 +660,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -672,7 +680,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -691,7 +700,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -710,7 +720,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -729,7 +740,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubw {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -753,8 +765,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -765,7 +777,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
@ -780,7 +792,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -796,7 +809,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -812,7 +826,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -828,7 +843,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -844,7 +860,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubb {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -865,8 +882,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -877,7 +894,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
@ -892,7 +909,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -908,7 +926,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -924,7 +943,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -940,7 +960,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -956,7 +977,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubb {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]


@ -10,7 +10,8 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3
@ -37,7 +38,8 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512CDBW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -54,7 +56,8 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -71,7 +74,8 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
@ -104,7 +108,8 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -121,7 +126,8 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
@ -134,7 +140,8 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3
@ -169,7 +176,8 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
; AVX512CDBW-NEXT: vpandd %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512CDBW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -190,7 +198,8 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -211,7 +220,8 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0)
@ -244,7 +254,8 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -265,7 +276,8 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
%out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1)
@ -278,8 +290,8 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512CD-NEXT: vpsubw %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512CD-NEXT: vpaddw %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@ -293,7 +305,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
@ -310,7 +322,8 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -329,7 +342,8 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -348,8 +362,8 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -363,7 +377,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
@@ -384,8 +398,8 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512CD-NEXT: vpsubw %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512CD-NEXT: vpaddw %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -399,7 +413,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
@@ -416,7 +430,8 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -435,7 +450,8 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -454,8 +470,8 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -469,7 +485,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
@@ -490,8 +506,8 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512CD-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -502,7 +518,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
@@ -516,7 +532,8 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -532,7 +549,8 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -548,8 +566,8 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -560,7 +578,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
@@ -578,8 +596,8 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512CD-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -590,7 +608,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
@@ -604,7 +622,8 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -620,7 +639,8 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -636,8 +656,8 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -648,7 +668,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1

View File

@@ -6,7 +6,7 @@ define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: subl $12, %esp
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <1,1,1,u>
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .p2align 4, 0x90
@@ -16,7 +16,7 @@ define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: pmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; CHECK-NEXT: paddd %xmm0, %xmm2
; CHECK-NEXT: psubd %xmm0, %xmm2
; CHECK-NEXT: pextrb $8, %xmm2, 2(%ecx,%eax,4)
; CHECK-NEXT: pshufb %xmm1, %xmm2
; CHECK-NEXT: pextrw $0, %xmm2, (%ecx,%eax,4)

View File

@@ -8,7 +8,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: subl $12, %esp
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: jmp .LBB0_1
@@ -26,7 +26,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; CHECK-NEXT: movl (%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; CHECK-NEXT: paddw %xmm0, %xmm3
; CHECK-NEXT: psubw %xmm0, %xmm3
; CHECK-NEXT: pand %xmm1, %xmm3
; CHECK-NEXT: pshufb %xmm2, %xmm3
; CHECK-NEXT: movq %xmm3, (%edx,%ecx,8)

View File

@@ -14,8 +14,8 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
; CHECK-NEXT: andl $-8, %esp
; CHECK-NEXT: subl $40, %esp
; CHECK-NEXT: movl {{\.LCPI.*}}, %eax
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <1,1,1,u>
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp)
@@ -29,7 +29,7 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; CHECK-NEXT: pinsrd $2, 4(%edx,%eax,8), %xmm2
; CHECK-NEXT: paddd %xmm0, %xmm2
; CHECK-NEXT: psubd %xmm0, %xmm2
; CHECK-NEXT: pextrw $4, %xmm2, 4(%ecx,%eax,8)
; CHECK-NEXT: pshufb %xmm1, %xmm2
; CHECK-NEXT: movd %xmm2, (%ecx,%eax,8)

View File

@@ -7,8 +7,7 @@ define void @convert(<7 x i32>* %dst, <14 x i16>* %src) nounwind {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,1,1,1,u,u>
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: cmpl $3, (%esp)
; CHECK-NEXT: jg .LBB0_3
; CHECK-NEXT: .p2align 4, 0x90
@@ -18,14 +17,14 @@ define void @convert(<7 x i32>* %dst, <14 x i16>* %src) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: shll $5, %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movdqa (%edx,%eax), %xmm2
; CHECK-NEXT: paddw %xmm0, %xmm2
; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm3
; CHECK-NEXT: paddw %xmm1, %xmm3
; CHECK-NEXT: pextrd $2, %xmm3, 24(%ecx,%eax)
; CHECK-NEXT: pextrd $1, %xmm3, 20(%ecx,%eax)
; CHECK-NEXT: movd %xmm3, 16(%ecx,%eax)
; CHECK-NEXT: movdqa %xmm2, (%ecx,%eax)
; CHECK-NEXT: movdqa (%edx,%eax), %xmm1
; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2
; CHECK-NEXT: psubw %xmm0, %xmm1
; CHECK-NEXT: psubw %xmm0, %xmm2
; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax)
; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax)
; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax)
; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: cmpl $3, (%esp)
; CHECK-NEXT: jle .LBB0_2

View File

@@ -8,7 +8,8 @@ define void @convert(<12 x i8>* %dst.addr, <3 x i32> %src) nounwind {
; X86-LABEL: convert:
; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubd %xmm1, %xmm0
; X86-NEXT: pextrd $2, %xmm0, 8(%eax)
; X86-NEXT: pextrd $1, %xmm0, 4(%eax)
; X86-NEXT: movd %xmm0, (%eax)
@@ -16,7 +17,8 @@ define void @convert(<12 x i8>* %dst.addr, <3 x i32> %src) nounwind {
;
; X64-LABEL: convert:
; X64: # BB#0:
; X64-NEXT: paddd {{.*}}(%rip), %xmm0
; X64-NEXT: pcmpeqd %xmm1, %xmm1
; X64-NEXT: psubd %xmm1, %xmm0
; X64-NEXT: pextrd $2, %xmm0, 8(%rdi)
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: retq

View File

@@ -9,7 +9,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; NARROW: # BB#0: # %entry
; NARROW-NEXT: subl $12, %esp
; NARROW-NEXT: movl $0, (%esp)
; NARROW-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1]
; NARROW-NEXT: pcmpeqd %xmm0, %xmm0
; NARROW-NEXT: movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; NARROW-NEXT: jmp .LBB0_1
; NARROW-NEXT: .p2align 4, 0x90
@@ -26,7 +26,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; NARROW-NEXT: movl (%esp), %ecx
; NARROW-NEXT: movl {{[0-9]+}}(%esp), %edx
; NARROW-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; NARROW-NEXT: paddw %xmm0, %xmm2
; NARROW-NEXT: psubw %xmm0, %xmm2
; NARROW-NEXT: psllw $8, %xmm2
; NARROW-NEXT: psraw $8, %xmm2
; NARROW-NEXT: psraw $2, %xmm2
@@ -46,7 +46,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; WIDE: # BB#0: # %entry
; WIDE-NEXT: subl $12, %esp
; WIDE-NEXT: movl $0, (%esp)
; WIDE-NEXT: movdqa {{.*#+}} xmm0 = <1,1,1,1,1,1,1,1,u,u,u,u,u,u,u,u>
; WIDE-NEXT: pcmpeqd %xmm0, %xmm0
; WIDE-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; WIDE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; WIDE-NEXT: jmp .LBB0_1
@@ -65,7 +65,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; WIDE-NEXT: movl {{[0-9]+}}(%esp), %edx
; WIDE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; WIDE-NEXT: pinsrd $1, 4(%eax,%ecx,8), %xmm3
; WIDE-NEXT: paddb %xmm0, %xmm3
; WIDE-NEXT: psubb %xmm0, %xmm3
; WIDE-NEXT: psrlw $2, %xmm3
; WIDE-NEXT: pand %xmm1, %xmm3
; WIDE-NEXT: pxor %xmm2, %xmm3

View File

@@ -35,7 +35,8 @@ define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) no
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movdqa (%ecx), %xmm0
; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubd %xmm1, %xmm0
; X86-NEXT: pextrb $8, %xmm0, 2(%eax)
; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT: pextrw $0, %xmm0, (%eax)
@@ -45,7 +46,8 @@ define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) no
; X64-LABEL: convert_v3i32_to_v3i8:
; X64: # BB#0: # %entry
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: paddd {{.*}}(%rip), %xmm0
; X64-NEXT: pcmpeqd %xmm1, %xmm1
; X64-NEXT: psubd %xmm1, %xmm0
; X64-NEXT: pextrb $8, %xmm0, 2(%rdi)
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT: pextrw $0, %xmm0, (%rdi)
@@ -70,7 +72,8 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) no
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: movdqa (%ecx), %xmm0
; X86-NEXT: paddw {{\.LCPI.*}}, %xmm0
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubw %xmm1, %xmm0
; X86-NEXT: pextrb $8, %xmm0, 4(%eax)
; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; X86-NEXT: movd %xmm0, (%eax)
@@ -81,7 +84,8 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) no
; X64-LABEL: convert_v5i16_to_v5i8:
; X64: # BB#0: # %entry
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: paddw {{.*}}(%rip), %xmm0
; X64-NEXT: pcmpeqd %xmm1, %xmm1
; X64-NEXT: psubw %xmm1, %xmm0
; X64-NEXT: pextrb $8, %xmm0, 4(%rdi)
; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; X64-NEXT: movd %xmm0, (%rdi)