Mirror of https://github.com/RPCS3/llvm-mirror.git (synced 2024-11-24 03:33:20 +01:00)
[X86][AVX] combineX86ShuffleChain - combine shuffle(extractsubvector(x),extractsubvector(y))
We already handle the case where we combine shuffle(extractsubvector(x),extractsubvector(x)); this relaxes the requirement to permit different sources as long as they have the same value type.

This causes a couple of cases where the VPERMV3 binary shuffles occur at a wider width than before, which I intend to improve in future commits - but as only the subvector's mask indices are defined, the mask constants can be broadcast, so we don't see any increase in constant size.

llvm-svn: 362599
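
To make the index arithmetic described above easier to follow, here is a small standalone C++ sketch of the mask widening (it is not the LLVM implementation; widenSubvectorMask, its parameters, and the example values are purely illustrative): the narrow mask's indices are rebased by each subvector's extraction offset, the second input's indices are additionally rebased past the first source when the two sources differ, and the mask is padded with undef out to the wide width.

// Standalone sketch (not the LLVM implementation); names are illustrative only.
#include <cstdio>
#include <vector>

static const int kUndef = -1; // stands in for SM_SentinelUndef

// Mask        : narrow mask; 0..NumMaskElts-1 pick from the first subvector,
//               NumMaskElts.. pick from the second.
// SubvecIdx1/2: which NumMaskElts-wide chunk of its wide source each subvector
//               was extracted from.
// Scale       : number of NumMaskElts-wide chunks in one wide source.
// SameSource  : true if both subvectors come from the same wide value.
static std::vector<int> widenSubvectorMask(const std::vector<int> &Mask,
                                           unsigned NumMaskElts,
                                           unsigned SubvecIdx1,
                                           unsigned SubvecIdx2, unsigned Scale,
                                           bool SameSource) {
  unsigned Offset1 = SubvecIdx1 * NumMaskElts;
  unsigned Offset2 = SubvecIdx2 * NumMaskElts;
  // With two distinct sources the second input is logically concatenated
  // after the first, so its indices are rebased past the whole first source.
  if (!SameSource)
    Offset2 += Scale * NumMaskElts;

  std::vector<int> NewMask(Mask);
  for (int &M : NewMask) {
    if (M < 0)
      continue; // leave undef indices alone
    if (M < (int)NumMaskElts)
      M += Offset1; // element taken from the first subvector
    else
      M = (M - NumMaskElts) + Offset2; // element taken from the second subvector
  }
  // Only the low NumMaskElts lanes of the widened shuffle are defined; pad the
  // rest with undef so the mask spans a full wide source.
  NewMask.insert(NewMask.end(), (Scale - 1) * NumMaskElts, kUndef);
  return NewMask;
}

int main() {
  // Hypothetical example: a 4-element mask over two v4 subvectors taken from
  // chunk 2 and chunk 1 of two different 16-element (Scale = 4) sources.
  std::vector<int> Widened =
      widenSubvectorMask({0, 1, 5, 7}, 4, 2, 1, 4, /*SameSource=*/false);
  for (int M : Widened)
    std::printf("%d ", M);
  std::printf("\n"); // prints: 8 9 21 23 followed by twelve -1 (undef) lanes
  return 0;
}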
This commit is contained in:
parent 4e860add7f
commit a9ea2c1dd3

@@ -32091,19 +32091,28 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       isa<ConstantSDNode>(V2.getOperand(1))) {
     SDValue Src1 = V1.getOperand(0);
     SDValue Src2 = V2.getOperand(0);
-    if (Src1 == Src2) {
+    if (Src1.getValueType() == Src2.getValueType()) {
       unsigned Offset1 = V1.getConstantOperandVal(1);
       unsigned Offset2 = V2.getConstantOperandVal(1);
       assert(((Offset1 % VT1.getVectorNumElements()) == 0 ||
               (Offset2 % VT2.getVectorNumElements()) == 0 ||
               (Src1.getValueSizeInBits() % RootSizeInBits) == 0) &&
              "Unexpected subvector extraction");
+      unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
+
       // Convert extraction indices to mask size.
       Offset1 /= VT1.getVectorNumElements();
       Offset2 /= VT2.getVectorNumElements();
       Offset1 *= NumMaskElts;
       Offset2 *= NumMaskElts;
 
+      SmallVector<SDValue, 2> NewInputs;
+      NewInputs.push_back(Src1);
+      if (Src1 != Src2) {
+        NewInputs.push_back(Src2);
+        Offset2 += Scale * NumMaskElts;
+      }
+
       // Create new mask for larger type.
       SmallVector<int, 64> NewMask(Mask);
       for (int &M : NewMask) {
@@ -32114,10 +32123,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
         else
           M = (M - NumMaskElts) + Offset2;
       }
-      unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
       NewMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
 
-      SDValue NewInputs[] = {Src1};
       if (SDValue Res = combineX86ShuffleChain(
               NewInputs, Src1, NewMask, Depth, HasVariableMask,
               AllowVariableMask, DAG, Subtarget)) {

@@ -1805,12 +1805,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp,
 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,3]
 ; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,1,0,2]
-; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vpermd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,3]
-; CHECK-NEXT:    vpermi2d %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vpermd %ymm3, %ymm1, %ymm1
+; CHECK-NEXT:    vpermt2d %ymm0, %ymm2, %ymm1
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
@@ -3128,10 +3128,9 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %v
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm4 = [0,6,2,6]
-; CHECK-NEXT:    vpermi2ps %xmm0, %xmm3, %xmm4
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [0,14,2,14]
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT:    vpermt2ps %ymm0, %ymm3, %ymm4
 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vcmpeqps %xmm0, %xmm2, %k1
 ; CHECK-NEXT:    vblendmps %xmm4, %xmm1, %xmm0 {%k1}
@@ -3146,12 +3145,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [0,6,2,6]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [0,14,2,14]
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT:    vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-NEXT:    vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vpermt2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>

@@ -11,45 +11,45 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT:    subq $72, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 80
; CHECK-NEXT:    vmovaps %xmm1, %xmm8
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm1
; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm5
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm10[0,1],xmm2[1],xmm10[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm9 = xmm5[0],xmm1[1],xmm5[2,3]
; CHECK-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm11 = xmm6[0,1],xmm2[1],xmm6[3]
; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm4
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm14 = xmm10[0,1],xmm2[1],xmm10[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm9 = xmm4[0],xmm1[1],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm11 = xmm0[0,1],xmm2[1],xmm0[3]
; CHECK-NEXT:    vextractf32x4 $3, %zmm3, %xmm7
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm11[0,1,2],xmm3[1]
; CHECK-NEXT:    vaddps %xmm4, %xmm6, %xmm12
; CHECK-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm5[0],xmm7[2],zero,zero
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm11[0,1,2],xmm3[1]
; CHECK-NEXT:    vaddps %xmm5, %xmm0, %xmm12
; CHECK-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm4[0],xmm7[2],zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm4[0]
; CHECK-NEXT:    vpermilps {{.*#+}} xmm4 = xmm2[3,1,2,3]
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm5[0],xmm1[2],zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm5[0]
; CHECK-NEXT:    vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm4[0],xmm1[2],zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm14[0,1,2],xmm3[1]
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm2[1],xmm9[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm9[0,1],xmm2[1],xmm9[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm4[0,1],xmm2[1],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
; CHECK-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT:    vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm3[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm9[0,1,2],xmm3[1]
; CHECK-NEXT:    vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm3[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm2
; CHECK-NEXT:    vaddps %xmm10, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm9, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm13, %xmm1, %xmm9
; CHECK-NEXT:    vaddps %xmm12, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm1, %xmm1, %xmm3

@@ -2994,8 +2994,9 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]
-; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,32,32,32,32,32,32,32,32,15,15,15,15,15,15,15,15,32,32,32,32,32,32,32,32]
+; AVX512VLVBMI-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
 ; AVX512VLVBMI-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <32 x i8> %shuffle