1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[X86][SSE] LowerVectorAllZeroTest - add support for >256-bit vectors

Reduce by repeatedly splitting the vector in half and ORing the two halves together until we reach the target size for PTEST/MOVMSK_PCMPEQ. There might be some cases where AVX512 can perform this with 512-bit vectors, but so far I haven't encountered any such pattern that reaches LowerVectorAllZeroTest.

Prep work for D81547
This commit is contained in:
Simon Pilgrim 2020-06-15 15:30:06 +01:00
parent 35f84c1504
commit 085e5b1dfa
2 changed files with 23 additions and 58 deletions

View File

@ -21366,9 +21366,9 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
}) &&
"Reduction source vector mismatch");
// Quit if not 128/256-bit vector.
// Quit if less than 128-bits or not splittable to 128/256-bit vector.
EVT VT = VecIns[0].getValueType();
if (!VT.is128BitVector() && !VT.is256BitVector())
if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
return SDValue();
SDLoc DL(Op);
@ -21382,18 +21382,28 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
}
SDValue V = VecIns.back();
// Split down to 128/256-bit vector.
unsigned TestSize = Subtarget.hasAVX()? 256 : 128;
while (VT.getSizeInBits() > TestSize) {
auto Split = DAG.SplitVector(V, DL);
VT = Split.first.getValueType();
V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
}
X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
DL, MVT::i8);
bool UsePTEST = Subtarget.hasSSE41();
if (UsePTEST) {
MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
SDValue V = DAG.getBitcast(TestVT, VecIns.back());
V = DAG.getBitcast(TestVT, V);
return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
}
SDValue Result = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8,
DAG.getBitcast(MVT::v16i8, VecIns.back()),
DAG.getBitcast(MVT::v16i8, V),
getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,

View File

@ -148,24 +148,9 @@ define i32 @veccond512(<16 x i32> %input) {
;
; AVX512-LABEL: veccond512:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmovq %xmm2, %rcx
; AVX512-NEXT: orq %rax, %rcx
; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vmovq %xmm3, %rax
; AVX512-NEXT: orq %rcx, %rax
; AVX512-NEXT: vmovq %xmm0, %rcx
; AVX512-NEXT: orq %rax, %rcx
; AVX512-NEXT: vpextrq $1, %xmm1, %rax
; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
; AVX512-NEXT: orq %rax, %rdx
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
; AVX512-NEXT: orq %rdx, %rax
; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
; AVX512-NEXT: orq %rax, %rdx
; AVX512-NEXT: orq %rcx, %rdx
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vptest %ymm0, %ymm0
; AVX512-NEXT: je .LBB2_2
; AVX512-NEXT: # %bb.1: # %if-true-block
; AVX512-NEXT: xorl %eax, %eax
@ -283,25 +268,10 @@ define i32 @vectest512(<16 x i32> %input) {
;
; AVX512-LABEL: vectest512:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmovq %xmm2, %rcx
; AVX512-NEXT: orq %rax, %rcx
; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vmovq %xmm3, %rax
; AVX512-NEXT: orq %rcx, %rax
; AVX512-NEXT: vmovq %xmm0, %rcx
; AVX512-NEXT: orq %rax, %rcx
; AVX512-NEXT: vpextrq $1, %xmm1, %rax
; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
; AVX512-NEXT: orq %rax, %rdx
; AVX512-NEXT: vpextrq $1, %xmm3, %rax
; AVX512-NEXT: orq %rdx, %rax
; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
; AVX512-NEXT: orq %rax, %rdx
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: orq %rcx, %rdx
; AVX512-NEXT: vptest %ymm0, %ymm0
; AVX512-NEXT: setne %al
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@ -410,24 +380,9 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
; AVX512-LABEL: vecsel512:
; AVX512: # %bb.0:
; AVX512-NEXT: movl %edi, %eax
; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX512-NEXT: vmovq %xmm1, %rcx
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmovq %xmm2, %rdx
; AVX512-NEXT: orq %rcx, %rdx
; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vmovq %xmm3, %rcx
; AVX512-NEXT: orq %rdx, %rcx
; AVX512-NEXT: vmovq %xmm0, %rdx
; AVX512-NEXT: orq %rcx, %rdx
; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512-NEXT: vpextrq $1, %xmm2, %rdi
; AVX512-NEXT: orq %rcx, %rdi
; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
; AVX512-NEXT: orq %rdi, %rcx
; AVX512-NEXT: vpextrq $1, %xmm0, %rdi
; AVX512-NEXT: orq %rcx, %rdi
; AVX512-NEXT: orq %rdx, %rdi
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vptest %ymm0, %ymm0
; AVX512-NEXT: cmovel %esi, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq