
[X86] Type legalize v2f32 loads by using an f64 load and a scalar_to_vector.

On 64-bit targets the generic legalizer will use an i64 load and a scalar_to_vector for us. But on 32-bit targets i64 isn't legal, so the generic legalizer ends up emitting two 32-bit loads. We have DAG combines that try to put those two loads back together, with pretty good success.

This patch instead uses f64 to avoid the splitting entirely. I've made it do the same in 64-bit mode as well, for consistency and to keep the load in the FP domain.
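As a minimal illustration of the pattern in question (a hypothetical reproducer, not a test from this patch), consider a plain non-extending load of <2 x float>:

; Hypothetical reproducer: a simple v2f32 load. With this patch it is
; legalized through one f64 load plus a scalar_to_vector, instead of an
; i64 load (64-bit mode) or two 32-bit loads (32-bit mode).
define <2 x float> @load_v2f32(<2 x float>* %p) {
  %v = load <2 x float>, <2 x float>* %p, align 8
  ret <2 x float> %v
}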

A few things in here look like regressions in 32-bit mode, but I believe they bring us closer to the 64-bit mode codegen, and the 64-bit mode code itself could be better. I think those issues should be looked at separately.

Differential Revision: https://reviews.llvm.org/D52528

llvm-svn: 344291
Craig Topper 2018-10-11 20:36:06 +00:00
parent a021c0c3b8
commit 139c1152ca
8 changed files with 61 additions and 39 deletions


@@ -902,6 +902,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (MVT VT : MVT::fp_vector_valuetypes())
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
+    // We want to legalize this to an f64 load rather than an i64 load on
+    // 64-bit targets and two 32-bit loads on a 32-bit target.
+    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
     setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
     setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
     setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
@@ -26420,6 +26424,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
     break;
   }
+  case ISD::LOAD: {
+    // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids
+    // scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp cast
+    // since type legalization will try to use an i64 load.
+    EVT VT = N->getValueType(0);
+    assert(VT == MVT::v2f32 && "Unexpected VT");
+    if (!ISD::isNON_EXTLoad(N))
+      return;
+    auto *Ld = cast<LoadSDNode>(N);
+    SDValue Res = DAG.getLoad(MVT::f64, dl, Ld->getChain(), Ld->getBasePtr(),
+                              Ld->getPointerInfo(),
+                              Ld->getAlignment(),
+                              Ld->getMemOperand()->getFlags());
+    SDValue Chain = Res.getValue(1);
+    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Res);
+    Res = DAG.getBitcast(MVT::v4f32, Res);
+    Results.push_back(Res);
+    Results.push_back(Chain);
+    return;
+  }
   }
 }
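Expressed as IR rather than SelectionDAG nodes, the rewrite above corresponds roughly to the sketch below (hypothetical function name; the real transform runs on DAG nodes, and v4f32 is the widened result type that type legalization expects for the illegal v2f32):

; Rough IR-level equivalent of the DAG rewrite in ReplaceNodeResults.
define <4 x float> @v2f32_load_as_f64(<2 x float>* %p) {
  %fp  = bitcast <2 x float>* %p to double*
  %d   = load double, double* %fp, align 8                  ; single f64 load
  %v   = insertelement <2 x double> undef, double %d, i32 0 ; scalar_to_vector
  %res = bitcast <2 x double> %v to <4 x float>             ; bitcast to v4f32
  ret <4 x float> %res
}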


@@ -17,8 +17,10 @@ define i1 @foo(i64 %a) {
 ;
 ; X86-SSE-LABEL: foo:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movaps %xmm0, %xmm1
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
 ; X86-SSE-NEXT:    setp %al
 ; X86-SSE-NEXT:    retl
 ;


@@ -16,8 +16,8 @@ define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    movlps %xmm0, (%rsi)
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-NEXT:    callq ext
 ; CHECK-NEXT:    addq $24, %rsp
 ; CHECK-NEXT:    retq


@@ -237,33 +237,35 @@ define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
 define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp {
 ; AVX1-LABEL: merge_8f32_2f32_23z5:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vmovups 16(%rdi), %xmm1
-; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovups 16(%rdi), %xmm0
+; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: merge_8f32_2f32_23z5:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vmovdqu 16(%rdi), %xmm1
-; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovupd 16(%rdi), %xmm0
+; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: merge_8f32_2f32_23z5:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovdqu 16(%rdi), %xmm1
-; AVX512F-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vmovupd 16(%rdi), %xmm0
+; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; X32-AVX-LABEL: merge_8f32_2f32_23z5:
 ; X32-AVX:       # %bb.0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
+; X32-AVX-NEXT:    vmovups 16(%eax), %xmm0
+; X32-AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X32-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
   %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3


@@ -1329,19 +1329,15 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
 ; X86-AVX1-LABEL: test_mm_loadh_pi:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
-; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X86-AVX1-NEXT:    vmovhpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x16,0x00]
+; X86-AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_loadh_pi:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
-; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X86-AVX512-NEXT:    vmovhpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x16,0x00]
+; X86-AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_loadh_pi:
@@ -1396,19 +1392,15 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
 ; X86-AVX1-LABEL: test_mm_loadl_pi:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX1-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX1-NEXT:    vmovlpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x12,0x00]
+; X86-AVX1-NEXT:    # xmm0 = mem[0],xmm0[1]
 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_loadl_pi:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX512-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX512-NEXT:    vmovlpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x12,0x00]
+; X86-AVX512-NEXT:    # xmm0 = mem[0],xmm0[1]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_loadl_pi:


@@ -171,7 +171,9 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl


@@ -1998,8 +1998,8 @@ define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
 define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
 ; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:


@@ -5,11 +5,11 @@
 ; This load should be before the call, not after.
-; SSE: movaps compl+128(%rip), %xmm0
+; SSE: movsd compl+128(%rip), %xmm0
 ; SSE: movaps %xmm0, (%rsp)
 ; SSE: callq killcommon
-; AVX: vmovaps compl+128(%rip), %xmm0
+; AVX: vmovsd compl+128(%rip), %xmm0
 ; AVX: vmovaps %xmm0, (%rsp)
 ; AVX: callq killcommon