
[X86] When lowering insert_vector_elt/extract_vector_elt of vXi1 with a non-constant index, just use either a 128-bit type or the vXi8 type with the correct number of elements.

Despite what the comment said, there isn't better codegen for 512-bit vectors. The 128/256/512-bit implementations all just store to memory and load an element. There's no advantage to doing that with a larger size; in fact, in many cases it causes a stack realignment and generates worse code.

llvm-svn: 321369
Craig Topper 2017-12-22 17:18:11 +00:00
parent 15d0e14230
commit acd88472c6
2 changed files with 36 additions and 70 deletions
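The new rule appears identically in both hunks of X86ISelLowering.cpp below: for a vXi1 mask with NumElts elements, the extended element type is i(128/NumElts) when NumElts <= 8 (so the widened vector is exactly 128 bits), and i8 otherwise (so wider masks become vXi8). A minimal standalone sketch of that selection; the helper name extendedEltBits and the printf driver are invented here for illustration, the real code builds MVTs as shown in the diff:

#include <cstdio>

// Widest element width that keeps an NumElts-element vector at 128 bits;
// past 8 elements, fall back to i8 lanes (vXi8).
static unsigned extendedEltBits(unsigned NumElts) {
  return NumElts <= 8 ? 128 / NumElts : 8;
}

int main() {
  // v2i1 -> v2i64, v4i1 -> v4i32, v8i1 -> v8i16 (all 128-bit),
  // v16i1 -> v16i8 (128-bit), v32i1 -> v32i8 (256-bit), v64i1 -> v64i8.
  for (unsigned N : {2u, 4u, 8u, 16u, 32u, 64u})
    std::printf("v%ui1 -> v%ui%u (%u bits)\n", N, N, extendedEltBits(N),
                N * extendedEltBits(N));
}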

lib/Target/X86/X86ISelLowering.cpp

@@ -14578,11 +14578,10 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
     unsigned NumElts = VecVT.getVectorNumElements();
     // Extending v8i1/v16i1 to 512-bit get better performance on KNL
     // than extending to 128/256bit.
-    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
-    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts);
-    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
-    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-                              ExtVT.getVectorElementType(), Ext, Idx);
+    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
+    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
+    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
   }
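
With a variable index, the extract still goes through memory: the mask is sign-extended into the widened vector, spilled to a stack slot, and one lane is reloaded and truncated back to i1 (the vmovdqa/movzbl pairs in the tests below). A rough scalar C++ model of that sequence for a v16i1 mask; the function name extractMaskBit and the loop are illustrative only:

#include <cstdint>

// Model of extracting bit `idx` of a 16-bit mask the way the lowering does:
// sign-extend each i1 to an i8 lane, spill the 128-bit vector to the stack,
// then do a single indexed byte load and truncate to i1.
static bool extractMaskBit(uint16_t mask, unsigned idx) {
  alignas(16) int8_t lanes[16];             // 16-byte slot: no realignment
  for (unsigned i = 0; i < 16; ++i)         // SIGN_EXTEND: 0 -> 0x00, 1 -> 0xFF
    lanes[i] = ((mask >> i) & 1) ? -1 : 0;
  return lanes[idx & 15] & 1;               // EXTRACT_VECTOR_ELT + TRUNCATE
}
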
@@ -14777,9 +14776,8 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
     // Non constant index. Extend source and destination,
     // insert element and then truncate the result.
     unsigned NumElts = VecVT.getVectorNumElements();
-    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
-    MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
-    MVT ExtEltVT = ExtVecVT.getVectorElementType();
+    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
+    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
                                 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
                                 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
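
The insert path is symmetric: both the source mask and the new bit are sign-extended, INSERT_VECTOR_ELT overwrites one lane at the runtime index, and the result is truncated back to vXi1 (in the test below this shows up as a spill, a scalar setne store into the slot, and a vpsllw/vpmovb2m re-pack). A sketch under the same illustrative naming:

#include <cstdint>

// Model of InsertBitToMaskVector for v16i1: widen to i8 lanes, overwrite one
// lane at the variable index, then re-pack the lanes into a 16-bit mask.
static uint16_t insertMaskBit(uint16_t mask, bool bit, unsigned idx) {
  int8_t lanes[16];
  for (unsigned i = 0; i < 16; ++i)         // SIGN_EXTEND of the source mask
    lanes[i] = ((mask >> i) & 1) ? -1 : 0;
  lanes[idx & 15] = bit ? -1 : 0;           // INSERT_VECTOR_ELT, runtime idx
  uint16_t out = 0;
  for (unsigned i = 0; i < 16; ++i)         // TRUNCATE back to v16i1
    out |= (uint16_t)(lanes[i] & 1) << i;
  return out;
}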

test/CodeGen/X86/avx512-insert-extract.ll

@@ -1616,45 +1616,28 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
 define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
 ; KNL-LABEL: test_extractelement_varible_v8i1:
 ; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
 ; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
 ; KNL-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
 ; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
 ; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vmovdqa64 %zmm0, (%rsp)
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: movzbl (%rsp,%rdi,8), %eax
+; KNL-NEXT: movzbl -24(%rsp,%rdi,2), %eax
 ; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
 ; KNL-NEXT: vzeroupper
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: test_extractelement_varible_v8i1:
 ; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
 ; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
 ; SKX-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; SKX-NEXT: vpmovm2q %k0, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
+; SKX-NEXT: vpmovm2w %k0, %xmm0
+; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: movzbl (%rsp,%rdi,8), %eax
+; SKX-NEXT: movzbl -24(%rsp,%rdi,2), %eax
 ; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
 ; SKX-NEXT: vzeroupper
 ; SKX-NEXT: retq
 %t1 = icmp ugt <8 x i32> %a, %b
@@ -1666,43 +1649,28 @@ define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b,
 define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
 ; KNL-LABEL: test_extractelement_varible_v16i1:
 ; KNL: ## %bb.0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
 ; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
 ; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vmovdqa32 %zmm0, (%rsp)
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: movzbl (%rsp,%rdi,4), %eax
+; KNL-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; KNL-NEXT: movzbl (%rdi,%rax), %eax
 ; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
 ; KNL-NEXT: vzeroupper
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: test_extractelement_varible_v16i1:
 ; SKX: ## %bb.0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
 ; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
 ; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; SKX-NEXT: vpmovm2d %k0, %zmm0
-; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: movzbl (%rsp,%rdi,4), %eax
+; SKX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SKX-NEXT: movzbl (%rdi,%rax), %eax
 ; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
 ; SKX-NEXT: vzeroupper
 ; SKX-NEXT: retq
 %t1 = icmp ugt <16 x i32> %a, %b
@@ -1743,14 +1711,15 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,
 ; SKX-NEXT: .cfi_offset %rbp, -16
 ; SKX-NEXT: movq %rsp, %rbp
 ; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: andq $-32, %rsp
+; SKX-NEXT: subq $64, %rsp
 ; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
 ; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
-; SKX-NEXT: vpmovm2w %k0, %zmm0
-; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: vpmovm2b %k0, %ymm0
+; SKX-NEXT: vmovdqa %ymm0, (%rsp)
 ; SKX-NEXT: andl $31, %edi
-; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax
+; SKX-NEXT: movq %rsp, %rax
+; SKX-NEXT: movzbl (%rdi,%rax), %eax
 ; SKX-NEXT: andl $1, %eax
 ; SKX-NEXT: movq %rbp, %rsp
 ; SKX-NEXT: popq %rbp
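
This hunk is where the realignment cost from the commit message is visible: a 64-byte zmm spill forced andq $-64, %rsp plus a frame pointer, the new 32-byte ymm spill only needs andq $-32, and the 16-byte xmm spills in the v8i1/v16i1 tests above fit the x86-64 ABI's 16-byte stack alignment with no realignment at all. A toy illustration of that arithmetic (the helper name is invented; 16-byte incoming alignment is assumed):

#include <cstdio>

// Extra alignment a vector spill slot demands beyond the ABI's 16 bytes;
// 0 means the slot already fits and no realignment code is emitted.
static unsigned extraAlignFor(unsigned spillBytes) {
  return spillBytes > 16 ? spillBytes : 0;
}

int main() {
  for (unsigned bytes : {64u, 32u, 16u}) {
    if (unsigned a = extraAlignFor(bytes))
      std::printf("%u-byte spill: andq $-%u, %%rsp + frame pointer\n", bytes, a);
    else
      std::printf("%u-byte spill: no realignment needed\n", bytes);
  }
}
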
@@ -1816,20 +1785,19 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
 ; SKX-NEXT: .cfi_offset %rbp, -16
 ; SKX-NEXT: movq %rsp, %rbp
 ; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: andq $-32, %rsp
+; SKX-NEXT: subq $64, %rsp
 ; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
-; SKX-NEXT: xorl %eax, %eax
-; SKX-NEXT: testb %dil, %dil
-; SKX-NEXT: setne %al
-; SKX-NEXT: vpmovm2w %k0, %zmm0
-; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
 ; SKX-NEXT: andl $31, %esi
-; SKX-NEXT: movw %ax, (%rsp,%rsi,2)
-; SKX-NEXT: vpsllw $15, (%rsp), %zmm0
-; SKX-NEXT: vpmovw2m %zmm0, %k0
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: vpmovm2b %k0, %ymm0
+; SKX-NEXT: vmovdqa %ymm0, (%rsp)
+; SKX-NEXT: movq %rsp, %rax
+; SKX-NEXT: setne (%rsi,%rax)
+; SKX-NEXT: vpsllw $7, (%rsp), %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k0
 ; SKX-NEXT: kmovd %k0, %eax
 ; SKX-NEXT: movq %rbp, %rsp
 ; SKX-NEXT: popq %rbp