1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[X86] Prefer VPBLENDD for v2i64/v4i64 blends with AVX2.

We were using VPBLENDW for v2i64 and VBLENDPD for v4i64. VPBLENDD has better throughput than VPBLENDW on some CPUs so it makes sense to use it when possible. VBLENDPD will probably become VBLENDD during execution domain fixing, but we might as well use integer in isel while we can.

This should work around some issues with the domain fixing pass prefering PBLENDW when we start with PBLENDW. There may still be some v8i16 cases that could use PBLENDD.

llvm-svn: 355281
This commit is contained in:
Craig Topper 2019-03-03 00:18:07 +00:00
parent 8b536c8004
commit b10ff17264
10 changed files with 207 additions and 83 deletions

View File

@ -6419,6 +6419,17 @@ def BlendScaleImm2 : SDNodeXForm<imm, [{
return getI8Imm(NewImm, SDLoc(N));
}]>;
// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
def BlendScaleImm2to4 : SDNodeXForm<imm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
if (Imm & (1 << i))
NewImm |= 0x3 << (i * 2);
}
return getI8Imm(NewImm, SDLoc(N));
}]>;
// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
uint8_t Imm = N->getZExtValue();
@ -6441,6 +6452,17 @@ def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;
// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{
uint8_t Imm = N->getZExtValue();
uint8_t NewImm = 0;
for (unsigned i = 0; i != 2; ++i) {
if (Imm & (1 << i))
NewImm |= 0x3 << (i * 2);
}
return getI8Imm(NewImm ^ 0xf, SDLoc(N));
}]>;
let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
@ -6553,7 +6575,7 @@ let Predicates = [HasAVX2] in {
// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will cleanup domains later on.
let Predicates = [HasAVX] in {
let Predicates = [HasAVX1Only] in {
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
(VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
@ -6569,9 +6591,7 @@ def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
(VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
(VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
}
let Predicates = [HasAVX1Only] in {
def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
(VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
@ -7867,6 +7887,20 @@ defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
SchedWriteBlend.YMM, VR256, i256mem,
BlendCommuteImm8>, VEX_L;
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
(VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
(VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
(VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
(VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
(VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
(VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
}
// For insertion into the zero index (low half) of a 256-bit vector, it is

View File

@ -1913,7 +1913,7 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x
; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],mem[4,5,6,7]
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
@ -1928,7 +1928,7 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2
; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],mem[4,5,6,7]
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@ -2564,7 +2564,7 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2
; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],mem[4,5,6,7]
; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
@ -2579,7 +2579,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa 32(%rdi), %xmm1
; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],mem[4,5,6,7]
; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq

View File

@ -1580,7 +1580,7 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsubq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
@ -1593,7 +1593,7 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpaddq %xmm2, %xmm0, %xmm2
; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@ -1606,7 +1606,7 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
; AVX512BW-NEXT: vmovq %rax, %xmm2
; AVX512BW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsravq %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
@ -2498,7 +2498,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) {
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
@ -2508,7 +2508,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) {
; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1
; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
@ -2580,7 +2580,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
@ -2591,7 +2591,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
@ -2665,7 +2665,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
@ -2676,7 +2676,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
@ -2949,12 +2949,26 @@ define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_nonuniform7:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX-NEXT: retq
; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2ORLATER-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform7:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT: retq
%1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1>
ret <8 x i16> %1
}

View File

@ -30,11 +30,23 @@ define <2 x i64> @insert_v2i64_x1(<2 x i64> %a) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v2i64_x1:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX-NEXT: retq
; AVX1-LABEL: insert_v2i64_x1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v2i64_x1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: insert_v2i64_x1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: retq
%1 = insertelement <2 x i64> %a, i64 -1, i32 0
ret <2 x i64> %1
}

View File

@ -43,12 +43,19 @@ define <8 x i16> @test2(<8 x i16> %a) {
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: retq
; AVX1-LABEL: test2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test2:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
%lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
ret <8 x i16> %lshr
}
@ -136,12 +143,19 @@ define <8 x i16> @test6(<8 x i16> %a) {
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: test6:
; AVX: # %bb.0:
; AVX-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: retq
; AVX1-LABEL: test6:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
%lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
ret <8 x i16> %lshr
}

View File

@ -19,14 +19,14 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-NEXT: vmovaps 80(%rbp), %ymm13
; CHECK-NEXT: vmovaps 48(%rbp), %ymm14
; CHECK-NEXT: vmovaps 16(%rbp), %ymm15
; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3]
; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
; CHECK-NEXT: vmovaps %xmm9, %xmm6
; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: # implicit-def: $ymm0
; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0]
; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3]
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
; CHECK-NEXT: vmovaps %xmm2, %xmm6
; CHECK-NEXT: # implicit-def: $ymm2
; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
@ -34,18 +34,18 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm6[0],zero
; CHECK-NEXT: # implicit-def: $ymm11
; CHECK-NEXT: vmovaps %xmm6, %xmm11
; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3]
; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
; CHECK-NEXT: vmovaps %xmm7, %xmm6
; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7]
; CHECK-NEXT: # implicit-def: $ymm11
; CHECK-NEXT: vmovaps %xmm6, %xmm11
; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3]
; CHECK-NEXT: vblendpd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3]
; CHECK-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3]
; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
; CHECK-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7]
; CHECK-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,1,3]
; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5]
; CHECK-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3]
; CHECK-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: vmovaps %ymm5, %ymm1
; CHECK-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill

View File

@ -694,14 +694,23 @@ define <4 x i32> @PR19721(<4 x i32> %i) {
; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: PR19721:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-AVX-NEXT: andq %rax, %rcx
; X64-AVX-NEXT: vmovq %rcx, %xmm1
; X64-AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; X64-AVX-NEXT: retq
; X64-AVX1-LABEL: PR19721:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovq %xmm0, %rax
; X64-AVX1-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-AVX1-NEXT: andq %rax, %rcx
; X64-AVX1-NEXT: vmovq %rcx, %xmm1
; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; X64-AVX1-NEXT: retq
;
; X64-AVX512-LABEL: PR19721:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-AVX512-NEXT: andq %rax, %rcx
; X64-AVX512-NEXT: vmovq %rcx, %xmm1
; X64-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; X64-AVX512-NEXT: retq
%bc = bitcast <4 x i32> %i to i128
%insert = and i128 %bc, -4294967296
%bc2 = bitcast i128 %insert to <4 x i32>

View File

@ -112,13 +112,29 @@ define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) {
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR39893:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: retq
; AVX1-LABEL: PR39893:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR39893:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR39893:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: retq
%sub = sub <2 x i32> <i32 0, i32 undef>, %x
%bc = bitcast <2 x i32> %sub to <8 x i8>
%shuffle = shufflevector <8 x i8> %y, <8 x i8> %bc, <2 x i32> <i32 10, i32 4>

View File

@ -2136,11 +2136,17 @@ define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_reg_lo_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovq %rdi, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX-NEXT: retq
; AVX1-LABEL: insert_reg_lo_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovq %rdi, %xmm1
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2OR512VL-NEXT: retq
%a.cast = bitcast i64 %a to <2 x i32>
%v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>

View File

@ -1155,7 +1155,7 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_0213cedf:
@ -1169,7 +1169,7 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
@ -1266,11 +1266,17 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_032dXXXX:
; AVX: # %bb.0:
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v8i16_032dXXXX:
; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_032dXXXX:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
}
@ -1418,11 +1424,17 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_012dcde3:
; AVX: # %bb.0:
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v8i16_012dcde3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_012dcde3:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
ret <8 x i16> %shuffle
}
@ -1549,12 +1561,19 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_XX4X8acX:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: retq
; AVX1-LABEL: shuffle_v8i16_XX4X8acX:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_XX4X8acX:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
ret <8 x i16> %shuffle
}