
[X86] Improve lowering of v2i64 sign bit tests on pre-sse4.2 targets

Without SSE4.2, a v2i64 setlt needs to expand into a pcmpgtd, a pcmpeqd, 3 shuffles, and 2 logic ops. But if we're only interested in the sign bit of each i64 element, we can just use one pcmpgtd against zero and shuffle the odd elements over the even elements.

Differential Revision: https://reviews.llvm.org/D72302
Craig Topper 2020-01-07 11:08:45 -08:00
parent f330417d5f
commit 8548edec3f
6 changed files with 977 additions and 1470 deletions
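
As background for the change below, a minimal scalar sketch of the observation the patch relies on (editor's illustration, not part of the patch; the function name is mine): in two's complement, the sign bit of an i64 element equals the sign bit of its high 32 bits, so a per-element "is negative" test on v2i64 only needs a signed 32-bit compare of each high half, which a single v4i32 pcmpgtd against zero provides.

  #include <cassert>
  #include <cstdint>

  // Editor's sketch: the i64 sign test reduces to a sign test on the
  // high 32 bits, which is the property the new lowering exploits.
  static bool SignBitViaHighHalf(int64_t X) {
    int32_t Hi = static_cast<int32_t>(static_cast<uint64_t>(X) >> 32);
    return Hi < 0; // same result as (X < 0)
  }

  int main() {
    const int64_t Vals[] = {INT64_MIN, -1, 0, 1, INT64_MAX};
    for (int64_t X : Vals)
      assert(SignBitViaHighHalf(X) == (X < 0));
  }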


@@ -21584,6 +21584,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
 
+      // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
+      // the odd elements over the even elements.
+      if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
+        Op0 = DAG.getConstant(0, dl, MVT::v4i32);
+        Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+        SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+        static const int MaskHi[] = { 1, 1, 3, 3 };
+        SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+        return DAG.getBitcast(VT, Result);
+      }
+
       // Since SSE has no unsigned integer comparisons, we need to flip the sign
       // bits of the inputs before performing those operations. The lower
       // compare is always unsigned.
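
As a per-lane model of what the added node sequence computes (an editor's sketch in plain C++, not SelectionDAG API code; the function name and test values are illustrative): the v4i32 PCMPGT of zero against the bitcast input yields a 32-bit all-ones mask in every negative lane, and the {1, 1, 3, 3} shuffle copies each odd (high-half) result over its even neighbour, producing the all-ones/all-zeros v2i64 mask a 64-bit setlt-zero would give.

  #include <array>
  #include <cassert>
  #include <cstdint>

  // Editor's model of the lowering: a v2i64 input viewed as four i32 lanes
  // (little-endian: lanes 1 and 3 hold the high halves).
  static std::array<uint64_t, 2> SignMaskV2I64(std::array<uint32_t, 4> V) {
    std::array<uint32_t, 4> GT;
    for (int I = 0; I < 4; ++I)            // PCMPGT: 0 > lane (signed)
      GT[I] = (0 > static_cast<int32_t>(V[I])) ? 0xFFFFFFFFu : 0u;
    // Shuffle mask {1, 1, 3, 3}: broadcast each high-half compare result
    // into both 32-bit halves of its i64 element.
    std::array<uint32_t, 4> S = {GT[1], GT[1], GT[3], GT[3]};
    return {(uint64_t(S[1]) << 32) | S[0], (uint64_t(S[3]) << 32) | S[2]};
  }

  int main() {
    // Element 0 is negative (high half 0x80000000), element 1 is not.
    auto M = SignMaskV2I64({0x12345678u, 0x80000000u, 0x0u, 0x7FFFFFFFu});
    assert(M[0] == ~uint64_t(0) && M[1] == 0);
  }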


@@ -429,48 +429,11 @@ define i16 @bitcast_v32i8_to_v2i16(<32 x i8> %a0) nounwind {
 define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
 ; SSE2-SSSE3-LABEL: bitcast_v8i64_to_v2i4:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm3, %xmm5
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm7, %xmm2
-; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm2
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm1
-; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %eax
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
 ; SSE2-SSSE3-NEXT: movzbl %al, %ecx
 ; SSE2-SSSE3-NEXT: shrl $4, %ecx
 ; SSE2-SSSE3-NEXT: movq %rcx, %xmm0


@@ -1015,48 +1015,11 @@ define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
 define i1 @allones_v8i64_sign(<8 x i64> %arg) {
 ; SSE2-LABEL: allones_v8i64_sign:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
-; SSE2-NEXT: packsswb %xmm0, %xmm1
-; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
 ; SSE2-NEXT: cmpb $-1, %al
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
@@ -1113,48 +1076,11 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) {
 define i1 @allzeros_v8i64_sign(<8 x i64> %arg) {
 ; SSE2-LABEL: allzeros_v8i64_sign:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: packssdw %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
-; SSE2-NEXT: packsswb %xmm0, %xmm1
-; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
 ; SSE2-NEXT: testb %al, %al
 ; SSE2-NEXT: sete %al
 ; SSE2-NEXT: retq
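
One way to read the test diffs above (an editor's note): when the i64 sign bits only feed a pmovmskb-style reduction, even the compare disappears, apparently because signed saturating packs keep each element's sign bit, so packssdw/packsswb can funnel the eight sign bits down into byte lanes before pmovmskb reads them. A scalar C++ sketch of that sign-preservation property (function name and values are illustrative):

  #include <cassert>
  #include <cstdint>

  // Editor's sketch: signed saturation (as in PACKSSDW/PACKSSWB) preserves
  // the sign of each element, so packing preserves a per-element sign test.
  static int16_t PackSSDWLane(int32_t X) {
    if (X > INT16_MAX) return INT16_MAX;
    if (X < INT16_MIN) return INT16_MIN;
    return static_cast<int16_t>(X);
  }

  int main() {
    const int32_t Vals[] = {INT32_MIN, -42, -1, 0, 1, INT32_MAX};
    for (int32_t X : Vals)
      assert((PackSSDWLane(X) < 0) == (X < 0)); // sign bit survives the pack
  }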

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -810,27 +810,22 @@ define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
 ; SSE-NEXT: movdqa %xmm0, %xmm3
 ; SSE-NEXT: pxor %xmm2, %xmm3
 ; SSE-NEXT: paddq %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, (%rdi)
-; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm0, %xmm2
 ; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE-NEXT: pcmpgtd %xmm2, %xmm4
 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm5, %xmm2
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE-NEXT: por %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pxor %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: movdqa %xmm0, (%rdi)
+; SSE-NEXT: movdqa %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: saddo_v2i64:
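
The saddo_v2i64 update shows the new lowering in place: the addend's sign test is now a zeroing pxor, one pcmpgtd, and a [1,1,3,3] pshufd instead of the full i64 compare expansion. For reference, a scalar sketch (editor's illustration, not the exact formulation the backend uses) of why signed-add overflow reduces to sign-bit tests, which is why this lowering applies here:

  #include <cassert>
  #include <cstdint>

  // Editor's sketch: signed i64 add overflow is a pure sign-bit question:
  // overflow iff "sum < a" disagrees with "b < 0". The second test is a
  // plain v2i64 sign-bit test, which is where the new lowering kicks in.
  static bool SAddOverflow(int64_t A, int64_t B) {
    // Wrapping add via unsigned arithmetic to avoid signed-overflow UB.
    int64_t Sum =
        static_cast<int64_t>(static_cast<uint64_t>(A) + static_cast<uint64_t>(B));
    return (Sum < A) != (B < 0);
  }

  int main() {
    assert(SAddOverflow(INT64_MAX, 1));
    assert(SAddOverflow(INT64_MIN, -1));
    assert(!SAddOverflow(-5, 3));
  }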