1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[X86] SimplifyMultipleUseDemandedBits - Add target shuffle support

llvm-svn: 367782
This commit is contained in:
Simon Pilgrim 2019-08-04 12:24:40 +00:00
parent 303bb3ab3e
commit 607c9c137e
3 changed files with 177 additions and 138 deletions

View File

@@ -34706,7 +34706,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
SelectionDAG &DAG, unsigned Depth) const {
int NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
switch (Opc) {
case X86ISD::PINSRB:
case X86ISD::PINSRW: {
@@ -34721,6 +34724,49 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
}
}
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 2> ShuffleOps;
if (VT.isSimple() && VT.isVector() &&
resolveTargetShuffleInputs(Op, ShuffleOps, ShuffleMask, DAG, Depth)) {
// If all the demanded elts are from one operand and are inline,
// then we can use the operand directly.
int NumOps = ShuffleOps.size();
if (ShuffleMask.size() == NumElts &&
llvm::all_of(ShuffleOps, [VT](SDValue V) {
return VT.getSizeInBits() == V.getValueSizeInBits();
})) {
// Bitmask that indicates which ops have only been accessed 'inline'.
APInt IdentityOp = APInt::getAllOnesValue(NumOps);
bool AllUndef = true;
for (int i = 0; i != NumElts; ++i) {
int M = ShuffleMask[i];
if (SM_SentinelUndef == M || !DemandedElts[i])
continue;
AllUndef = false;
int Op = M / NumElts;
int Index = M % NumElts;
if (M < 0 || Index != i) {
IdentityOp.clearAllBits();
break;
}
IdentityOp &= APInt::getOneBitSet(NumOps, Op);
if (IdentityOp == 0)
break;
}
assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
"Multiple identity shuffles detected");
if (AllUndef)
return DAG.getUNDEF(VT);
for (int i = 0; i != NumOps; ++i)
if (IdentityOp[i])
return DAG.getBitcast(VT, ShuffleOps[i]);
}
}
return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
Op, DemandedBits, DemandedElts, DAG, Depth);
}

View File

@@ -562,143 +562,145 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun
; SSE2-LABEL: smulo_v6i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movd %r8d, %xmm9
; SSE2-NEXT: movd %r8d, %xmm8
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
; SSE2-NEXT: movd %edx, %xmm6
; SSE2-NEXT: movd %esi, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; SSE2-NEXT: movd %edx, %xmm3
; SSE2-NEXT: movd %esi, %xmm6
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
; SSE2-NEXT: movd {{.*#+}} xmm10 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT: movd %r9d, %xmm12
; SSE2-NEXT: movd {{.*#+}} xmm11 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
; SSE2-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm12 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movd %r9d, %xmm13
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
; SSE2-NEXT: movdqa %xmm13, %xmm11
; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
; SSE2-NEXT: movd {{.*#+}} xmm9 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; SSE2-NEXT: pmuludq %xmm7, %xmm13
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
; SSE2-NEXT: pand %xmm6, %xmm5
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pmuludq %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm10[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm9[0,0]
; SSE2-NEXT: pmuludq %xmm7, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSE2-NEXT: psubd %xmm1, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSE2-NEXT: movdqa %xmm7, (%rcx)
; SSE2-NEXT: psrad $31, %xmm7
; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm0, %xmm7
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
; SSE2-NEXT: pand %xmm12, %xmm1
; SSE2-NEXT: pcmpgtd %xmm12, %xmm4
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: pmuludq %xmm12, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: pmuludq %xmm8, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: psubd %xmm4, %xmm1
; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddd %xmm5, %xmm0
; SSE2-NEXT: pmuludq %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0],xmm10[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm8[0,0]
; SSE2-NEXT: pmuludq %xmm12, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT: psubd %xmm0, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, (%rcx)
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm7, %xmm3
; SSE2-NEXT: pand %xmm11, %xmm3
; SSE2-NEXT: pcmpgtd %xmm11, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: paddd %xmm3, %xmm4
; SSE2-NEXT: pmuludq %xmm9, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE2-NEXT: psubd %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: movq %xmm2, 16(%rcx)
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movq %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm7, (%rdi)
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: movq %xmm3, 16(%rcx)
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: movq %xmm3, 16(%rdi)
; SSE2-NEXT: movdqa %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: smulo_v6i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %rax
; SSSE3-NEXT: movd %r8d, %xmm9
; SSSE3-NEXT: movd %r8d, %xmm8
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
; SSSE3-NEXT: movd %edx, %xmm6
; SSSE3-NEXT: movd %esi, %xmm5
; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; SSSE3-NEXT: movd %edx, %xmm3
; SSSE3-NEXT: movd %esi, %xmm6
; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
; SSSE3-NEXT: movd {{.*#+}} xmm10 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSSE3-NEXT: movd %r9d, %xmm12
; SSSE3-NEXT: movd {{.*#+}} xmm11 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
; SSSE3-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm12 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT: movd %r9d, %xmm13
; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
; SSSE3-NEXT: movdqa %xmm13, %xmm11
; SSSE3-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
; SSSE3-NEXT: movd {{.*#+}} xmm9 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; SSSE3-NEXT: pmuludq %xmm7, %xmm13
; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
; SSSE3-NEXT: pand %xmm6, %xmm5
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: paddd %xmm0, %xmm1
; SSSE3-NEXT: pmuludq %xmm5, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm10[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm9[0,0]
; SSSE3-NEXT: pmuludq %xmm7, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; SSSE3-NEXT: psubd %xmm1, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSSE3-NEXT: movdqa %xmm7, (%rcx)
; SSSE3-NEXT: psrad $31, %xmm7
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm7
; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT: pxor %xmm0, %xmm7
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
; SSSE3-NEXT: pand %xmm12, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm12, %xmm4
; SSSE3-NEXT: pand %xmm2, %xmm4
; SSSE3-NEXT: paddd %xmm1, %xmm4
; SSSE3-NEXT: pmuludq %xmm12, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: pmuludq %xmm8, %xmm11
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSSE3-NEXT: psubd %xmm4, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: paddd %xmm5, %xmm0
; SSSE3-NEXT: pmuludq %xmm6, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0],xmm10[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm8[0,0]
; SSSE3-NEXT: pmuludq %xmm12, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSSE3-NEXT: psubd %xmm0, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movdqa %xmm0, (%rcx)
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3
; SSSE3-NEXT: pand %xmm11, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4
; SSSE3-NEXT: pand %xmm7, %xmm4
; SSSE3-NEXT: paddd %xmm3, %xmm4
; SSSE3-NEXT: pmuludq %xmm9, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSSE3-NEXT: psubd %xmm4, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-NEXT: movq %xmm2, 16(%rcx)
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
; SSSE3-NEXT: pxor %xmm0, %xmm2
; SSSE3-NEXT: movq %xmm2, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm7, (%rdi)
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSSE3-NEXT: movq %xmm3, 16(%rcx)
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
; SSSE3-NEXT: pxor %xmm1, %xmm3
; SSSE3-NEXT: movq %xmm3, 16(%rdi)
; SSSE3-NEXT: movdqa %xmm0, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: smulo_v6i32:

View File

@@ -1589,9 +1589,8 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -1603,8 +1602,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -1616,8 +1614,7 @@ define i8 @test_v4i8(<4 x i8> %a0) {
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
@@ -1650,15 +1647,13 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $8, %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[10],zero,xmm1[14],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pextrb $0, %xmm1, %eax
; SSE41-NEXT: # kill: def $al killed $al killed $eax
@@ -1670,14 +1665,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrb $0, %xmm0, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
@@ -1689,14 +1682,12 @@ define i8 @test_v8i8(<8 x i8> %a0) {
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax