From 4d1399fbdd4d4b9e862d723026f6ccc4211c537d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 19 Mar 2018 17:31:41 +0000 Subject: [PATCH] [X86] Remove sse41 specific code from lowering v16i8 multiply With the SRAs removed from the SSE2 code in D44267, there doesn't appear to be any advantage to the sse41 code. The punpcklbw instruction and pmovsx seem to have the same latency and throughput on most CPUs. And the SSE41 code requires moving the upper 64 bits into the lower 64 bits before the sign extend can be done. The punpckhbw in the sse2 code can do better than that. llvm-svn: 327869 --- lib/Target/X86/X86ISelLowering.cpp | 48 ++---- test/CodeGen/X86/combine-mul.ll | 13 +- test/CodeGen/X86/pmul.ll | 191 ++++++++++------------- test/CodeGen/X86/vector-idiv-sdiv-128.ll | 22 ++- test/CodeGen/X86/vector-idiv-sdiv-256.ll | 16 +- test/CodeGen/X86/vector-idiv-udiv-128.ll | 22 ++- test/CodeGen/X86/vector-idiv-udiv-256.ll | 74 +++++---- test/CodeGen/X86/vector-mul.ll | 153 +++++++++--------- 8 files changed, 246 insertions(+), 293 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f0981687a64..67f1a313f29 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -22449,42 +22449,26 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, MVT ExVT = MVT::v8i16; // Extract the lo parts and sign extend to i16 - SDValue ALo, BLo; - if (Subtarget.hasSSE41()) { - ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT); - BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT); - } else { - // We're going to mask off the low byte of each result element of the - // pmullw, so it doesn't matter what's in the high byte of each 16-bit - // element. - const int ShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1, + // We're going to mask off the low byte of each result element of the + // pmullw, so it doesn't matter what's in the high byte of each 16-bit + // element. 
+ const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1}; - ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); - BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - ALo = DAG.getBitcast(ExVT, ALo); - BLo = DAG.getBitcast(ExVT, BLo); - } + SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask); + SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask); + ALo = DAG.getBitcast(ExVT, ALo); + BLo = DAG.getBitcast(ExVT, BLo); // Extract the hi parts and sign extend to i16 - SDValue AHi, BHi; - if (Subtarget.hasSSE41()) { - const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15, - -1, -1, -1, -1, -1, -1, -1, -1}; - AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); - BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT); - BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT); - } else { - // We're going to mask off the low byte of each result element of the - // pmullw, so it doesn't matter what's in the high byte of each 16-bit - // element. - const int ShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1, + // We're going to mask off the low byte of each result element of the + // pmullw, so it doesn't matter what's in the high byte of each 16-bit + // element. 
+ const int HiShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1}; - AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); - BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getBitcast(ExVT, AHi); - BHi = DAG.getBitcast(ExVT, BHi); - } + SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask); + SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask); + AHi = DAG.getBitcast(ExVT, AHi); + BHi = DAG.getBitcast(ExVT, BHi); // Multiply, mask the lower 8bits of the lo/hi results and pack SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); diff --git a/test/CodeGen/X86/combine-mul.ll b/test/CodeGen/X86/combine-mul.ll index 4a0d3df969d..78278409ebd 100644 --- a/test/CodeGen/X86/combine-mul.ll +++ b/test/CodeGen/X86/combine-mul.ll @@ -289,14 +289,15 @@ define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) { define <16 x i8> @PR35579(<16 x i8> %x) { ; SSE-LABEL: PR35579: ; SSE: # %bb.0: -; SSE-NEXT: pmovsxbw %xmm0, %xmm1 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pmullw %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 8d898d95c95..6355a1d60ce 100644 --- a/test/CodeGen/X86/pmul.ll +++ 
b/test/CodeGen/X86/pmul.ll @@ -22,15 +22,14 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind { ; ; SSE41-LABEL: mul_v16i8c: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw %xmm0, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117] -; SSE41-NEXT: pmullw %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: pmullw %xmm2, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: packuswb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -156,17 +155,15 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v16i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw %xmm1, %xmm3 -; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pmullw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 ; SSE41-NEXT: packuswb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -401,24 +398,22 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind { ; ; SSE41-LABEL: mul_v32i8c: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117] -; SSE41-NEXT: pmullw %xmm4, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; SSE41-NEXT: pmullw %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pmullw %xmm4, %xmm2 +; SSE41-NEXT: pand %xmm5, %xmm2 ; SSE41-NEXT: packuswb %xmm0, %xmm2 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm5, %xmm1 ; SSE41-NEXT: pmullw %xmm4, %xmm3 ; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: 
pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: pmullw %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm3 +; SSE41-NEXT: packuswb %xmm1, %xmm3 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm3, %xmm1 ; SSE41-NEXT: retq @@ -583,28 +578,24 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v32i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw %xmm2, %xmm5 -; SSE41-NEXT: pmovsxbw %xmm0, %xmm4 -; SSE41-NEXT: pmullw %xmm5, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: pmullw %xmm5, %xmm4 +; SSE41-NEXT: pand %xmm6, %xmm4 ; SSE41-NEXT: packuswb %xmm0, %xmm4 -; SSE41-NEXT: pmovsxbw %xmm3, %xmm0 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm6, %xmm1 ; SSE41-NEXT: pmullw %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm6, %xmm2 ; SSE41-NEXT: packuswb %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: movdqa %xmm2, %xmm1 @@ -795,40 +786,36 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117] -; SSE41-NEXT: pmullw %xmm6, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; SSE41-NEXT: pmullw %xmm6, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm7, %xmm1 +; SSE41-NEXT: pmullw %xmm6, %xmm0 ; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 -; SSE41-NEXT: pmullw %xmm6, %xmm1 -; SSE41-NEXT: pand %xmm7, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pmovsxbw %xmm4, %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm6, %xmm4 +; SSE41-NEXT: pand %xmm7, %xmm4 ; SSE41-NEXT: pmullw %xmm6, %xmm1 ; SSE41-NEXT: pand %xmm7, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm4, %xmm4 -; SSE41-NEXT: pmullw %xmm6, %xmm4 -; SSE41-NEXT: pand %xmm7, %xmm4 ; SSE41-NEXT: packuswb %xmm4, %xmm1 -; SSE41-NEXT: pmovsxbw %xmm2, %xmm4 -; SSE41-NEXT: pmullw %xmm6, %xmm4 -; SSE41-NEXT: pand %xmm7, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm6, %xmm2 ; SSE41-NEXT: pand %xmm7, %xmm2 +; SSE41-NEXT: pmullw %xmm6, %xmm4 +; SSE41-NEXT: pand %xmm7, %xmm4 ; SSE41-NEXT: packuswb %xmm2, %xmm4 -; SSE41-NEXT: pmovsxbw %xmm3, %xmm5 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm6, %xmm3 +; SSE41-NEXT: pand %xmm7, %xmm3 ; SSE41-NEXT: pmullw %xmm6, %xmm5 ; SSE41-NEXT: pand %xmm7, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 -; SSE41-NEXT: pmullw %xmm6, %xmm2 -; SSE41-NEXT: pand %xmm7, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm5 +; SSE41-NEXT: packuswb %xmm3, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm5, %xmm3 ; SSE41-NEXT: retq @@ -963,50 +950,42 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movdqa %xmm1, %xmm8 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmovsxbw %xmm4, %xmm9 -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: pmullw %xmm9, %xmm0 +; 
SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm4, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm9, %xmm1 +; SSE41-NEXT: pmullw %xmm10, %xmm0 ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm4, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 -; SSE41-NEXT: pmullw %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm9, %xmm1 ; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pmovsxbw %xmm5, %xmm4 -; SSE41-NEXT: pmovsxbw %xmm8, %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm5, %xmm8 +; SSE41-NEXT: pand %xmm9, %xmm8 ; SSE41-NEXT: pmullw %xmm4, %xmm1 ; SSE41-NEXT: pand %xmm9, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm4, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm5, %xmm5 -; SSE41-NEXT: pmullw %xmm4, %xmm5 -; SSE41-NEXT: pand %xmm9, %xmm5 -; SSE41-NEXT: packuswb %xmm5, %xmm1 -; SSE41-NEXT: pmovsxbw %xmm6, %xmm5 -; SSE41-NEXT: pmovsxbw %xmm2, %xmm4 +; 
SSE41-NEXT: packuswb %xmm8, %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm6, %xmm2 +; SSE41-NEXT: pand %xmm9, %xmm2 ; SSE41-NEXT: pmullw %xmm5, %xmm4 ; SSE41-NEXT: pand %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm5, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 -; SSE41-NEXT: pmullw %xmm5, %xmm2 -; SSE41-NEXT: pand %xmm9, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm4 -; SSE41-NEXT: pmovsxbw %xmm7, %xmm2 -; SSE41-NEXT: pmovsxbw %xmm3, %xmm5 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm9, %xmm3 ; SSE41-NEXT: pmullw %xmm2, %xmm5 ; SSE41-NEXT: pand %xmm9, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 -; SSE41-NEXT: pmullw %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm9, %xmm3 ; SSE41-NEXT: packuswb %xmm3, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm2 ; SSE41-NEXT: movdqa %xmm5, %xmm3 diff --git 
a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 126e3906c29..2416a177228 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -552,15 +552,14 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: psrlw $7, %xmm1 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; SSE41-NEXT: pmullw %xmm3, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pmullw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: packuswb %xmm1, %xmm2 ; SSE41-NEXT: psubb %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -585,16 +584,15 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: 
vpmovsxbw %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll index a9d5976ee7d..c112e84fbf7 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -473,16 +473,15 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmovsxbw %xmm3, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 ; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3 @@ -500,14 +499,13 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbw %xmm1, %xmm3 +; AVX1-NEXT: 
vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll index 8af5067c097..c991a905c05 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -521,15 +521,14 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: paddb %xmm1, %xmm2 ; SSE41-NEXT: psrlw $2, %xmm2 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE41-NEXT: pmovsxbw %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] -; SSE41-NEXT: pmullw %xmm3, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; SSE41-NEXT: pmullw %xmm3, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: pmullw %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm1 ; SSE41-NEXT: packuswb %xmm2, %xmm1 ; SSE41-NEXT: psubb %xmm1, %xmm0 ; SSE41-NEXT: retq 
@@ -551,16 +550,15 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll index 602f050935d..81d93984e26 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -463,59 +463,57 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind { define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-LABEL: test_rem7_32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37] -; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37] +; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4 -; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmovsxbw %xmm3, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 -; AVX1-NEXT: vpmullw %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmullw %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpmullw %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbw %xmm1, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: 
vpsubb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmullw %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_rem7_32i8: diff --git a/test/CodeGen/X86/vector-mul.ll b/test/CodeGen/X86/vector-mul.ll index 9c4112639de..15861353d16 100644 --- a/test/CodeGen/X86/vector-mul.ll +++ b/test/CodeGen/X86/vector-mul.ll @@ -325,43 +325,40 @@ define <8 x i16> @mul_v8i16_17(<8 x i16> %a0) nounwind { define <16 x i8> @mul_v16i8_17(<16 x i8> %a0) nounwind { ; X86-LABEL: mul_v16i8_17: ; X86: # %bb.0: -; X86-NEXT: pmovsxbw %xmm0, %xmm1 -; X86-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17] -; X86-NEXT: pmullw %xmm2, %xmm1 -; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X86-NEXT: pand %xmm3, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X86-NEXT: pmovsxbw %xmm0, %xmm0 +; X86-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17] ; X86-NEXT: pmullw %xmm2, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; X86-NEXT: pand %xmm3, %xmm0 +; X86-NEXT: pmullw %xmm2, %xmm1 +; X86-NEXT: pand %xmm3, %xmm1 ; X86-NEXT: packuswb %xmm0, %xmm1 ; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v16i8_17: ; X64: # %bb.0: -; X64-NEXT: pmovsxbw %xmm0, %xmm1 -; X64-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17] -; X64-NEXT: pmullw %xmm2, %xmm1 -; X64-NEXT: movdqa {{.*#+}} xmm3 = 
[255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm3, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-NEXT: pmovsxbw %xmm0, %xmm0 +; X64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17] ; X64-NEXT: pmullw %xmm2, %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; X64-NEXT: pand %xmm3, %xmm0 +; X64-NEXT: pmullw %xmm2, %xmm1 +; X64-NEXT: pand %xmm3, %xmm1 ; X64-NEXT: packuswb %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-XOP-LABEL: mul_v16i8_17: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm1 -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17] +; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17] ; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; X64-XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm0 +; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,2,4,6,8,10,12,14],xmm0[0,2,4,6,8,10,12,14] +; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14] ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v16i8_17: @@ -460,40 +457,43 @@ define <8 x i16> @mul_v8i16_2_3_9_17_33_65_129_257(<8 x i16> %a0) nounwind { define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> %a0) nounwind { ; X86-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: ; X86: # %bb.0: -; X86-NEXT: pmovsxbw %xmm0, %xmm1 -; X86-NEXT: pmullw 
{{\.LCPI.*}}, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3] +; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-NEXT: pmullw %xmm2, %xmm0 ; X86-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; X86-NEXT: pand %xmm2, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X86-NEXT: pmovsxbw %xmm0, %xmm0 -; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X86-NEXT: pand %xmm2, %xmm0 +; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm1 +; X86-NEXT: pand %xmm2, %xmm1 ; X86-NEXT: packuswb %xmm0, %xmm1 ; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: ; X64: # %bb.0: -; X64-NEXT: pmovsxbw %xmm0, %xmm1 -; X64-NEXT: pmullw {{.*}}(%rip), %xmm1 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3] +; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: pmullw %xmm2, %xmm0 ; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm2, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-NEXT: pmovsxbw %xmm0, %xmm0 -; X64-NEXT: pmullw {{.*}}(%rip), %xmm0 ; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pmullw {{.*}}(%rip), %xmm1 +; X64-NEXT: pand %xmm2, %xmm1 ; X64-NEXT: packuswb %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm1 -; 
X64-XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; X64-XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm0 +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3] +; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-XOP-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,2,4,6,8,10,12,14],xmm0[0,2,4,6,8,10,12,14] +; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14] ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3: @@ -598,43 +598,40 @@ define <8 x i16> @mul_v8i16_7(<8 x i16> %a0) nounwind { define <16 x i8> @mul_v16i8_31(<16 x i8> %a0) nounwind { ; X86-LABEL: mul_v16i8_31: ; X86: # %bb.0: -; X86-NEXT: pmovsxbw %xmm0, %xmm1 -; X86-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31] -; X86-NEXT: pmullw %xmm2, %xmm1 -; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X86-NEXT: pand %xmm3, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X86-NEXT: pmovsxbw %xmm0, %xmm0 +; X86-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; X86-NEXT: pmullw %xmm2, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; X86-NEXT: pand %xmm3, %xmm0 +; X86-NEXT: pmullw %xmm2, %xmm1 +; X86-NEXT: pand %xmm3, %xmm1 ; X86-NEXT: packuswb %xmm0, %xmm1 ; X86-NEXT: movdqa 
%xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v16i8_31: ; X64: # %bb.0: -; X64-NEXT: pmovsxbw %xmm0, %xmm1 -; X64-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31] -; X64-NEXT: pmullw %xmm2, %xmm1 -; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm3, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-NEXT: pmovsxbw %xmm0, %xmm0 +; X64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; X64-NEXT: pmullw %xmm2, %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; X64-NEXT: pand %xmm3, %xmm0 +; X64-NEXT: pmullw %xmm2, %xmm1 +; X64-NEXT: pand %xmm3, %xmm1 ; X64-NEXT: packuswb %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-XOP-LABEL: mul_v16i8_31: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm1 -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31] +; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; X64-XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm0 +; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,2,4,6,8,10,12,14],xmm0[0,2,4,6,8,10,12,14] +; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14] ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v16i8_31: @@ -967,43 +964,43 @@ define <8 x i16> @mul_v8i16_0_1_7_15_31_63_127_255(<8 x i16> %a0) nounwind { 
define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> %a0) nounwind { ; X86-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: ; X86: # %bb.0: -; X86-NEXT: pmovsxbw %xmm0, %xmm1 -; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127] -; X86-NEXT: pmullw %xmm2, %xmm1 -; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X86-NEXT: pand %xmm3, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X86-NEXT: pmovsxbw %xmm0, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127] +; X86-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X86-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; X86-NEXT: pmullw %xmm2, %xmm0 -; X86-NEXT: pand %xmm3, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; X86-NEXT: pand %xmm2, %xmm0 +; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm1 +; X86-NEXT: pand %xmm2, %xmm1 ; X86-NEXT: packuswb %xmm0, %xmm1 ; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: ; X64: # %bb.0: -; X64-NEXT: pmovsxbw %xmm0, %xmm1 -; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127] -; X64-NEXT: pmullw %xmm2, %xmm1 -; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; X64-NEXT: pand %xmm3, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-NEXT: pmovsxbw %xmm0, %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127] +; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; X64-NEXT: pmullw %xmm2, %xmm0 -; X64-NEXT: pand %xmm3, %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pmullw {{.*}}(%rip), %xmm1 +; X64-NEXT: pand %xmm2, %xmm1 ; X64-NEXT: packuswb %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; ; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: ; X64-XOP: # %bb.0: -; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm1 -; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127] -; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; X64-XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm0 -; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,2,4,6,8,10,12,14],xmm0[0,2,4,6,8,10,12,14] +; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127] +; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X64-XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; X64-XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-XOP-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14] ; X64-XOP-NEXT: retq ; ; X64-AVX2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127: