1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-20 19:42:54 +02:00

[SelectionDAG] Add SimplifyDemandedBits to SimplifyDemandedVectorElts simplification

This patch enables SimplifyDemandedBits to call SimplifyDemandedVectorElts in cases where the demanded bits mask covers entire elements of a bitcasted source vector.

There are a couple of cases here where simplification at a deeper level (such as through bitcasts) prevents further simplification - CommitTargetLoweringOpt only adds immediate uses/users back to the worklist when we might want to combine the original caller again to see what else it can simplify.

As well as that I had to disable handling of bool vector until SimplifyDemandedVectorElts better supports some of their opcodes (SETCC, shifts etc.).

Fixes PR39178

Differential Revision: https://reviews.llvm.org/D52935

llvm-svn: 343913
This commit is contained in:
Simon Pilgrim 2018-10-06 10:20:04 +00:00
parent 1f5778cb15
commit e1793d6523
18 changed files with 360 additions and 525 deletions

View File

@ -1179,29 +1179,64 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
Known.Zero |= ~InMask;
break;
}
case ISD::BITCAST:
case ISD::BITCAST: {
SDValue Src = Op.getOperand(0);
EVT SrcVT = Src.getValueType();
unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
// If this is an FP->Int bitcast and if the sign bit is the only
// thing demanded, turn this into a FGETSIGN.
if (!TLO.LegalOperations() && !VT.isVector() &&
!Op.getOperand(0).getValueType().isVector() &&
if (!TLO.LegalOperations() && !VT.isVector() && !SrcVT.isVector() &&
NewMask == APInt::getSignMask(Op.getValueSizeInBits()) &&
Op.getOperand(0).getValueType().isFloatingPoint()) {
SrcVT.isFloatingPoint()) {
bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, VT);
bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
if ((OpVTLegal || i32Legal) && VT.isSimple() &&
Op.getOperand(0).getValueType() != MVT::f16 &&
Op.getOperand(0).getValueType() != MVT::f128) {
bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
if ((OpVTLegal || i32Legal) && VT.isSimple() && SrcVT != MVT::f16 &&
SrcVT != MVT::f128) {
// Cannot eliminate/lower SHL for f128 yet.
EVT Ty = OpVTLegal ? VT : MVT::i32;
// Make a FGETSIGN + SHL to move the sign bit into the appropriate
// place. We expect the SHL to be eliminated by other optimizations.
SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Op.getOperand(0));
SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Src);
unsigned OpVTSizeInBits = Op.getValueSizeInBits();
if (!OpVTLegal && OpVTSizeInBits > 32)
Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Sign);
unsigned ShVal = Op.getValueSizeInBits() - 1;
SDValue ShAmt = TLO.DAG.getConstant(ShVal, dl, VT);
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
return TLO.CombineTo(Op,
TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
}
}
// If bitcast from a vector and the mask covers entire elements, see if we
// can use SimplifyDemandedVectorElts.
// TODO - bigendian once we have test coverage.
// TODO - bool vectors once SimplifyDemandedVectorElts has SETCC support.
if (SrcVT.isVector() && NumSrcEltBits > 1 &&
(BitWidth % NumSrcEltBits) == 0 &&
TLO.DAG.getDataLayout().isLittleEndian()) {
unsigned Scale = BitWidth / NumSrcEltBits;
auto GetDemandedSubMask = [&](APInt &DemandedSubElts) -> bool {
DemandedSubElts = APInt::getNullValue(Scale);
for (unsigned i = 0; i != Scale; ++i) {
unsigned Offset = i * NumSrcEltBits;
APInt Sub = DemandedMask.extractBits(NumSrcEltBits, Offset);
if (Sub.isAllOnesValue())
DemandedSubElts.setBit(i);
else if (!Sub.isNullValue())
return false;
}
return true;
};
APInt DemandedSubElts;
if (GetDemandedSubMask(DemandedSubElts)) {
unsigned NumSrcElts = SrcVT.getVectorNumElements();
APInt DemandedElts = APInt::getSplat(NumSrcElts, DemandedSubElts);
APInt KnownUndef, KnownZero;
if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
TLO, Depth + 1))
return true;
}
}
// If this is a bitcast, let computeKnownBits handle it. Only do this on a
@ -1211,6 +1246,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
return false;
}
break;
}
case ISD::ADD:
case ISD::MUL:
case ISD::SUB: {

View File

@ -1826,12 +1826,6 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $32, %ymm0, %ymm2
; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; CHECK-NEXT: vpsllq $32, %ymm1, %ymm2
; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
@ -1846,9 +1840,6 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>

View File

@ -1718,11 +1718,6 @@ entry:
define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA
; CHECK-NEXT: kmovw %eax, %k0
; CHECK-NEXT: knotw %k0, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>

View File

@ -4519,9 +4519,7 @@ define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_mul_epi32_rmb:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
; X86-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1]
; X86-NEXT: retl ## encoding: [0xc3]
;
@ -4541,9 +4539,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
; X86-LABEL: test_mask_mul_epi32_rmbk:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: ## xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuldq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xca]
@ -4568,9 +4564,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask
; X86-LABEL: test_mask_mul_epi32_rmbkz:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
@ -4696,9 +4690,7 @@ define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_mul_epu32_rmb:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
; X86-NEXT: retl ## encoding: [0xc3]
;
@ -4718,9 +4710,7 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
; X86-LABEL: test_mask_mul_epu32_rmbk:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: ## xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca]
@ -4745,9 +4735,7 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask
; X86-LABEL: test_mask_mul_epu32_rmbkz:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
@ -6160,9 +6148,7 @@ define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
; X86-LABEL: test_mul_epi32_rmb:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
; X86-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1]
; X86-NEXT: retl ## encoding: [0xc3]
;
@ -6182,9 +6168,7 @@ define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %pas
; X86-LABEL: test_mul_epi32_rmbk:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: ## xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuldq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xca]
@ -6211,9 +6195,7 @@ define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; X86-LABEL: test_mul_epi32_rmbkz:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
@ -6349,9 +6331,7 @@ define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
; X86-LABEL: test_mul_epu32_rmb:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
; X86-NEXT: retl ## encoding: [0xc3]
;
@ -6371,9 +6351,7 @@ define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %pas
; X86-LABEL: test_mul_epu32_rmbk:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: ## xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
; X86-NEXT: vpbroadcastd (%eax), %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca]
@ -6400,9 +6378,7 @@ define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; X86-LABEL: test_mul_epu32_rmbkz:
; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: ## xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]

View File

@ -9655,7 +9655,7 @@ define < 2 x i64> @test_mask_mul_epi32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_mul_epi32_rmb_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08]
; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08]
; X86-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@ -9675,7 +9675,7 @@ define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2
; X86-LABEL: test_mask_mul_epi32_rmbk_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10]
; X86-NEXT: vpbroadcastd (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuldq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x28,0xca]
@ -9700,7 +9700,7 @@ define < 2 x i64> @test_mask_mul_epi32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8
; X86-LABEL: test_mask_mul_epi32_rmbkz_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08]
; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1]
@ -9826,9 +9826,7 @@ define < 4 x i64> @test_mask_mul_epi32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_mul_epi32_rmb_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: # xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08]
; X86-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@ -9848,9 +9846,7 @@ define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4
; X86-LABEL: test_mask_mul_epi32_rmbk_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: # xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd2]
; X86-NEXT: vpbroadcastd (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuldq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x28,0xca]
@ -9875,9 +9871,7 @@ define < 4 x i64> @test_mask_mul_epi32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8
; X86-LABEL: test_mask_mul_epi32_rmbkz_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: # xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x28,0xc1]
@ -10003,7 +9997,7 @@ define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_mul_epu32_rmb_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08]
; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08]
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@ -10023,7 +10017,7 @@ define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2
; X86-LABEL: test_mask_mul_epu32_rmbk_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10]
; X86-NEXT: vpbroadcastd (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuludq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xca]
@ -10048,7 +10042,7 @@ define < 2 x i64> @test_mask_mul_epu32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8
; X86-LABEL: test_mask_mul_epu32_rmbkz_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08]
; X86-NEXT: vpbroadcastd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1]
@ -10174,9 +10168,7 @@ define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
; X86-LABEL: test_mask_mul_epu32_rmb_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: # xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08]
; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
;
@ -10196,9 +10188,7 @@ define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4
; X86-LABEL: test_mask_mul_epu32_rmbk_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
; X86-NEXT: # xmm2 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd2]
; X86-NEXT: vpbroadcastd (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x10]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuludq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xca]
@ -10223,9 +10213,7 @@ define < 4 x i64> @test_mask_mul_epu32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8
; X86-LABEL: test_mask_mul_epu32_rmbkz_256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: vmovq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
; X86-NEXT: # xmm1 = mem[0],zero
; X86-NEXT: vpbroadcastq %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc9]
; X86-NEXT: vpbroadcastd (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x08]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1]

View File

@ -41,20 +41,15 @@ define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
ret <2 x i64> %5
}
; TODO - blends are superfluous
define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_shuffle_zero_pmuludq:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@ -62,7 +57,6 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; AVX512VL-LABEL: combine_shuffle_zero_pmuludq:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
@ -70,7 +64,6 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: retq
@ -82,23 +75,16 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
ret <2 x i64> %5
}
; TODO - blends are superfluous
define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq_256:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX2-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@ -106,7 +92,6 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1)
; AVX512VL-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
@ -114,7 +99,6 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1)
; AVX512DQVL-LABEL: combine_shuffle_zero_pmuludq_256:
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX512DQVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq

View File

@ -402,23 +402,21 @@ define <4 x i32> @combine_vec_shl_ge_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_ge_ashr_extact1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $8, %xmm1
; SSE2-NEXT: psrad $5, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $5, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad $4, %xmm3
; SSE2-NEXT: psrad $3, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,64,128,256]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psrad $3, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $8, %xmm1
; SSE2-NEXT: psrad $4, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_ge_ashr_extact1:
@ -466,23 +464,21 @@ define <4 x i32> @combine_vec_shl_lt_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_lt_ashr_extact1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $8, %xmm1
; SSE2-NEXT: psrad $7, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $7, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad $6, %xmm3
; SSE2-NEXT: psrad $5, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,16,32,256]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psrad $5, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $8, %xmm1
; SSE2-NEXT: psrad $6, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_lt_ashr_extact1:
@ -533,23 +529,21 @@ define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_gt_lshr1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $8, %xmm1
; SSE2-NEXT: psrld $5, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $5, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld $4, %xmm3
; SSE2-NEXT: psrld $3, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,64,128,256]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psrld $3, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $8, %xmm1
; SSE2-NEXT: psrld $4, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_gt_lshr1:
@ -600,23 +594,21 @@ define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_le_lshr1:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $8, %xmm1
; SSE2-NEXT: psrld $7, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $7, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld $6, %xmm3
; SSE2-NEXT: psrld $5, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,16,32,256]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: psrld $5, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $8, %xmm1
; SSE2-NEXT: psrld $6, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_shl_le_lshr1:

View File

@ -39,24 +39,18 @@ define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) {
define <2 x i32> @_mul2xi32b(<2 x i32>, <2 x i32>) {
; SSE2-LABEL: _mul2xi32b:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: retq
;
; SSE42-LABEL: _mul2xi32b:
; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE42-NEXT: pmuludq %xmm0, %xmm1
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE42-NEXT: pmuludq %xmm1, %xmm0
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE42-NEXT: retq
;
; AVX-LABEL: _mul2xi32b:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
@ -349,27 +343,13 @@ define <2 x i64> @_mul2xi64toi64a(<2 x i64>, <2 x i64>) {
;
; SSE42-LABEL: _mul2xi64toi64a:
; SSE42: # %bb.0:
; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE42-NEXT: pmuludq %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: _mul2xi64toi64a:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: _mul2xi64toi64a:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
; AVX-LABEL: _mul2xi64toi64a:
; AVX: # %bb.0:
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%f00 = extractelement <2 x i64> %0, i32 0
%f01 = extractelement <2 x i64> %0, i32 1
%f10 = extractelement <2 x i64> %1, i32 0

View File

@ -1318,76 +1318,55 @@ entry:
define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm10
; SSE2-NEXT: movdqa %xmm1, %xmm9
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
; SSE2-NEXT: pmuludq %xmm5, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
; SSE2-NEXT: pmuludq %xmm0, %xmm4
; SSE2-NEXT: paddq %xmm3, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pmuludq %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; SSE2-NEXT: psllq $32, %xmm4
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm4
; SSE2-NEXT: paddq %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm6, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: movdqa %xmm6, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: psrad $16, %xmm6
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; SSE2-NEXT: psllq $32, %xmm4
; SSE2-NEXT: paddq %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE2-NEXT: pmuludq %xmm2, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
; SSE2-NEXT: pmuludq %xmm6, %xmm4
; SSE2-NEXT: paddq %xmm5, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pmuludq %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm7, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: psrad $16, %xmm7
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrad $16, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT: psllq $32, %xmm4
; SSE2-NEXT: paddq %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm6
; SSE2-NEXT: pmuludq %xmm5, %xmm4
; SSE2-NEXT: paddq %xmm6, %xmm4
; SSE2-NEXT: pmuludq %xmm5, %xmm3
; SSE2-NEXT: psllq $32, %xmm4
; SSE2-NEXT: paddq %xmm4, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm9, %xmm4
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; SSE2-NEXT: pmuludq %xmm9, %xmm0
; SSE2-NEXT: psllq $32, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: pmuludq %xmm11, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
; SSE2-NEXT: pmuludq %xmm11, %xmm1
; SSE2-NEXT: psllq $32, %xmm1
; SSE2-NEXT: paddq %xmm5, %xmm1
; SSE2-NEXT: pmuludq %xmm10, %xmm6
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1]
; SSE2-NEXT: pmuludq %xmm10, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
; SSE2-NEXT: paddq %xmm6, %xmm2
; SSE2-NEXT: pmuludq %xmm8, %xmm7
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1]
; SSE2-NEXT: pmuludq %xmm8, %xmm3
; SSE2-NEXT: psllq $32, %xmm3
; SSE2-NEXT: paddq %xmm7, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_sext:

View File

@ -7,7 +7,7 @@
define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32], [128 x i64] }*) nounwind {
; X86-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
; X86-SKYLAKE: # %bb.0: # %entry
; X86-SKYLAKE-NEXT: subl $12, %esp
; X86-SKYLAKE-NEXT: subl $8, %esp
; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SKYLAKE-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@ -21,12 +21,12 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
; X86-SKYLAKE-NEXT: vmovd %xmm0, %ecx
; X86-SKYLAKE-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
; X86-SKYLAKE-NEXT: movl %ecx, (%eax)
; X86-SKYLAKE-NEXT: addl $12, %esp
; X86-SKYLAKE-NEXT: addl $8, %esp
; X86-SKYLAKE-NEXT: retl
;
; X86-SKX-LABEL: fetch_r16g16_snorm_unorm8:
; X86-SKX: # %bb.0: # %entry
; X86-SKX-NEXT: subl $12, %esp
; X86-SKX-NEXT: subl $8, %esp
; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u]
@ -35,19 +35,16 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
; X86-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; X86-SKX-NEXT: vpsrld $7, %xmm0, %xmm0
; X86-SKX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; X86-SKX-NEXT: vpmovqw %xmm1, {{[0-9]+}}(%esp)
; X86-SKX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X86-SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-SKX-NEXT: vpmovqw %xmm0, {{[0-9]+}}(%esp)
; X86-SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-SKX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
; X86-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-SKX-NEXT: vpmovdb %xmm0, (%esp)
; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SKX-NEXT: movzwl (%esp), %ecx
; X86-SKX-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
; X86-SKX-NEXT: movl %ecx, (%eax)
; X86-SKX-NEXT: addl $12, %esp
; X86-SKX-NEXT: addl $8, %esp
; X86-SKX-NEXT: retl
;
; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
@ -74,13 +71,10 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
; X64-SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; X64-SKX-NEXT: vpsrld $7, %xmm0, %xmm0
; X64-SKX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-SKX-NEXT: vpmovqw %xmm1, -{{[0-9]+}}(%rsp)
; X64-SKX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X64-SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-SKX-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
; X64-SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-SKX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
; X64-SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-SKX-NEXT: vpmovdb %xmm0, -{{[0-9]+}}(%rsp)
; X64-SKX-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
; X64-SKX-NEXT: orl $-16777216, %eax # imm = 0xFF000000

View File

@ -1507,7 +1507,7 @@ define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@ -1643,7 +1643,7 @@ define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl c, %edx
; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT: retl
@ -2047,12 +2047,16 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,65536,0]
; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,u,65536,u>
; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
; X86-SSE-NEXT: psllq $32, %xmm1
; X86-SSE-NEXT: paddq %xmm0, %xmm1
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-SSE-NEXT: movdqa %xmm2, %xmm3
; X86-SSE-NEXT: psrlq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
; X86-SSE-NEXT: paddq %xmm1, %xmm3
; X86-SSE-NEXT: psllq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
; X86-SSE-NEXT: paddq %xmm3, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;
@ -2128,13 +2132,17 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT: psrad $16, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0]
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,u,32768,u>
; X86-SSE-NEXT: pxor %xmm2, %xmm2
; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
; X86-SSE-NEXT: psllq $32, %xmm2
; X86-SSE-NEXT: paddq %xmm0, %xmm2
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X86-SSE-NEXT: movdqa %xmm1, %xmm3
; X86-SSE-NEXT: psrlq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm0, %xmm3
; X86-SSE-NEXT: paddq %xmm2, %xmm3
; X86-SSE-NEXT: psllq $32, %xmm3
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
; X86-SSE-NEXT: paddq %xmm3, %xmm0
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT: retl
;

View File

@ -2755,23 +2755,13 @@ define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X86-SSE-LABEL: test_mm_mul_epu32:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; X86-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
; X86-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
; X86-SSE-NEXT: pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2]
; X86-SSE-NEXT: pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca]
; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
; X86-SSE-NEXT: retl # encoding: [0xc3]
; SSE-LABEL: test_mm_mul_epu32:
; SSE: # %bb.0:
; SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_mul_epu32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xef,0xd2]
; AVX1-NEXT: vpblendw $204, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xcc]
; AVX1-NEXT: # xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpblendw $204, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xcc]
; AVX1-NEXT: # xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf4,0xc1]
; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
@ -2784,16 +2774,6 @@ define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; AVX512-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512-NEXT: vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_mul_epu32:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
; X64-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A]
; X64-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT: pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2]
; X64-SSE-NEXT: pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca]
; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1]
; X64-SSE-NEXT: retq # encoding: [0xc3]
%A = and <2 x i64> %a0, <i64 4294967295, i64 4294967295>
%B = and <2 x i64> %a1, <i64 4294967295, i64 4294967295>
%res = mul nuw <2 x i64> %A, %B

View File

@ -832,26 +832,11 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mul_epi32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psllq $32, %xmm2
; SSE-NEXT: psrad $31, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; SSE-NEXT: pmuldq %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: pmuldq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX1-LABEL: test_mm_mul_epi32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: ret{{[l|q]}}
;

View File

@ -143,33 +143,31 @@ define <4 x i32> @test_urem_odd_div_nonsplat(<4 x i32> %X) nounwind readnone {
define <4 x i32> @test_urem_even_div_nonsplat(<4 x i32> %X) nounwind readnone {
; CHECK-SSE2-LABEL: test_urem_even_div_nonsplat:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,2454267027]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2863311531,3435973837,2863311531,2454267027]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
; CHECK-SSE2-NEXT: psrld $2, %xmm2
; CHECK-SSE2-NEXT: psrld $3, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
; CHECK-SSE2-NEXT: psrld $1, %xmm3
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
; CHECK-SSE2-NEXT: psrld $2, %xmm1
; CHECK-SSE2-NEXT: psrld $3, %xmm2
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [6,10,12,14]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
@ -277,20 +275,17 @@ define <4 x i32> @test_urem_pow2_nonsplat(<4 x i32> %X) nounwind readnone {
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
; CHECK-SSE2-NEXT: psrld $3, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4
; CHECK-SSE2-NEXT: psrld $2, %xmm4
; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [6,10,12,16]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [6,10,12,16]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
; CHECK-SSE2-NEXT: psrld $2, %xmm2
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@ -382,34 +377,31 @@ define <4 x i32> @test_urem_pow2_nonsplat(<4 x i32> %X) nounwind readnone {
define <4 x i32> @test_urem_one_nonsplat(<4 x i32> %X) nounwind readnone {
; CHECK-SSE2-LABEL: test_urem_one_nonsplat:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: psrld $1, %xmm1
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,0,2863311531,2454267027]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2863311531,0,2863311531,2454267027]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0]
; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
; CHECK-SSE2-NEXT: psrld $2, %xmm2
; CHECK-SSE2-NEXT: psrld $3, %xmm1
; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,1,12,14]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
; CHECK-SSE2-NEXT: psrld $1, %xmm3
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
; CHECK-SSE2-NEXT: psrld $2, %xmm1
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
; CHECK-SSE2-NEXT: psrld $3, %xmm2
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [6,1,12,14]
; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0

View File

@ -627,11 +627,10 @@ define void @test_urem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X86-NEXT: pand {{\.LCPI.*}}, %xmm1
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-NEXT: andps {{\.LCPI.*}}, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
;

View File

@ -460,7 +460,7 @@ define <16 x i8> @mul_v16i8_neg5(<16 x i8> %a0) nounwind {
define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
; X86-LABEL: mul_v2i64_17_65:
; X86: # %bb.0:
; X86-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0]
; X86-NEXT: movdqa {{.*#+}} xmm1 = <17,u,65,u>
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: psrlq $32, %xmm0
@ -809,7 +809,7 @@ define <16 x i8> @mul_v16i8_neg15(<16 x i8> %a0) nounwind {
define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
; X86-LABEL: mul_v2i64_15_63:
; X86: # %bb.0:
; X86-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0]
; X86-NEXT: movdqa {{.*#+}} xmm1 = <15,u,63,u>
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: psrlq $32, %xmm0
@ -845,16 +845,17 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
; X86-LABEL: mul_v2i64_neg_15_63:
; X86: # %bb.0:
; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967281,4294967295,4294967233,4294967295]
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: movdqa %xmm0, %xmm3
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrlq $32, %xmm1
; X86-NEXT: movdqa {{.*#+}} xmm2 = <4294967281,u,4294967233,u>
; X86-NEXT: pmuludq %xmm2, %xmm1
; X86-NEXT: movdqa %xmm2, %xmm3
; X86-NEXT: psrlq $32, %xmm3
; X86-NEXT: pmuludq %xmm1, %xmm3
; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
; X86-NEXT: pmuludq %xmm0, %xmm3
; X86-NEXT: paddq %xmm1, %xmm3
; X86-NEXT: psllq $32, %xmm3
; X86-NEXT: pmuludq %xmm2, %xmm0
; X86-NEXT: paddq %xmm3, %xmm0
; X86-NEXT: psllq $32, %xmm0
; X86-NEXT: paddq %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: mul_v2i64_neg_15_63:
@ -889,16 +890,17 @@ define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
; X86-LABEL: mul_v2i64_neg_17_65:
; X86: # %bb.0:
; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967279,4294967295,4294967231,4294967295]
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: movdqa %xmm0, %xmm3
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrlq $32, %xmm1
; X86-NEXT: movdqa {{.*#+}} xmm2 = <4294967279,u,4294967231,u>
; X86-NEXT: pmuludq %xmm2, %xmm1
; X86-NEXT: movdqa %xmm2, %xmm3
; X86-NEXT: psrlq $32, %xmm3
; X86-NEXT: pmuludq %xmm1, %xmm3
; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
; X86-NEXT: pmuludq %xmm0, %xmm3
; X86-NEXT: paddq %xmm1, %xmm3
; X86-NEXT: psllq $32, %xmm3
; X86-NEXT: pmuludq %xmm2, %xmm0
; X86-NEXT: paddq %xmm3, %xmm0
; X86-NEXT: psllq $32, %xmm0
; X86-NEXT: paddq %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: mul_v2i64_neg_17_65:
@ -933,7 +935,7 @@ define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
; X86-LABEL: mul_v2i64_0_1:
; X86: # %bb.0:
; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,0]
; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,u,1,u>
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: psrlq $32, %xmm0
@ -975,7 +977,7 @@ define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
; X86: # %bb.0:
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrlq $32, %xmm1
; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,4294967295]
; X86-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u>
; X86-NEXT: pmuludq %xmm2, %xmm1
; X86-NEXT: movdqa %xmm2, %xmm3
; X86-NEXT: psrlq $32, %xmm3
@ -1029,7 +1031,7 @@ define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
; X86: # %bb.0:
; X86-NEXT: movdqa %xmm0, %xmm1
; X86-NEXT: psrlq $32, %xmm1
; X86-NEXT: movdqa {{.*#+}} xmm2 = [15,0,4294967233,4294967295]
; X86-NEXT: movdqa {{.*#+}} xmm2 = <15,u,4294967233,u>
; X86-NEXT: pmuludq %xmm2, %xmm1
; X86-NEXT: movdqa %xmm2, %xmm3
; X86-NEXT: psrlq $32, %xmm3
@ -1172,7 +1174,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
; X86-LABEL: mul_v2i64_68_132:
; X86: # %bb.0:
; X86-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0]
; X86-NEXT: movdqa {{.*#+}} xmm1 = <68,u,132,u>
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: psrlq $32, %xmm0
@ -1208,7 +1210,7 @@ define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind {
; X86-LABEL: mul_v2i64_60_120:
; X86: # %bb.0:
; X86-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0]
; X86-NEXT: movdqa {{.*#+}} xmm1 = <60,u,124,u>
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: psrlq $32, %xmm0

View File

@ -835,9 +835,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,0,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
@ -896,34 +895,25 @@ define i32 @test_v8i32(<8 x i32> %a0) {
define i32 @test_v16i32(<16 x i32> %a0) {
; SSE2-LABEL: test_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm6
; SSE2-NEXT: pmuludq %xmm5, %xmm6
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,0,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,0,0]
; SSE2-NEXT: pmuludq %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
@ -992,64 +982,39 @@ define i32 @test_v32i32(<32 x i32> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm8, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm8, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; SSE2-NEXT: pmuludq %xmm2, %xmm6
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm8, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm9, %xmm3
; SSE2-NEXT: pmuludq %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm10, %xmm2
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm3, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,0,0]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,0,0]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i32:

View File

@ -5595,40 +5595,29 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: movdqa %xmm0, %xmm6
; SSE-NEXT: psrad $31, %xmm6
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; SSE-NEXT: movdqa %xmm4, %xmm5
; SSE-NEXT: psrad $31, %xmm5
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE-NEXT: movdqa %xmm1, %xmm7
; SSE-NEXT: psrad $31, %xmm7
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
; SSE-NEXT: pxor %xmm8, %xmm8
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
; SSE-NEXT: pmuludq %xmm1, %xmm6
; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; SSE-NEXT: pmuludq %xmm0, %xmm7
; SSE-NEXT: paddq %xmm6, %xmm7
; SSE-NEXT: psllq $32, %xmm7
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: paddq %xmm7, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
; SSE-NEXT: pmuludq %xmm4, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
; SSE-NEXT: pmuludq %xmm2, %xmm5
; SSE-NEXT: paddq %xmm3, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: paddq %xmm5, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrad $31, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrad $31, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: pxor %xmm6, %xmm6
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: psllq $32, %xmm2
; SSE-NEXT: paddq %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm5, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; SSE-NEXT: pmuludq %xmm5, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: paddq %xmm3, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
; SSE-NEXT: paddd %xmm2, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32: