
[DAG] Improve legalization of INSERT_SUBVECTOR

When the index is known to be constant 0, insert directly into the low half,
instead of spilling, performing the insert in memory, and reloading.

Differential Revision: http://reviews.llvm.org/D20763

llvm-svn: 271428
Michael Kuperstein 2016-06-01 20:49:35 +00:00
parent 2092f44163
commit 1e7dd66dfc
2 changed files with 55 additions and 115 deletions
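
To make the condition described above concrete: the direct insert is only safe when the subvector lands entirely inside the low half of the split vector. Here is a minimal standalone C++ sketch of that check (my own illustration, not code from the patch; the helper name fitsInLowHalf is invented):

#include <cassert>

// Mirrors the guard added in SplitVecRes_INSERT_SUBVECTOR below: the insert
// index must be the constant 0, and the subvector must not cross the
// boundary between the two halves at VecElems / 2.
bool fitsInLowHalf(unsigned IdxVal, unsigned SubElems, unsigned VecElems) {
  return IdxVal == 0 && IdxVal + SubElems <= VecElems / 2;
}

int main() {
  assert(fitsInLowHalf(0, 4, 8));  // v4i32 into v8i32 at index 0: fits
  assert(!fitsInLowHalf(0, 8, 8)); // v8i32 into v8i32: crosses the boundary
  assert(!fitsInLowHalf(2, 2, 8)); // nonzero index: excluded for now (see TODO)
  return 0;
}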

lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

@@ -851,15 +851,34 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
   SDLoc dl(N);
   GetSplitVector(Vec, Lo, Hi);
 
-  // Spill the vector to the stack.
   EVT VecVT = Vec.getValueType();
-  EVT SubVecVT = VecVT.getVectorElementType();
+  EVT VecElemVT = VecVT.getVectorElementType();
+  unsigned VecElems = VecVT.getVectorNumElements();
+  unsigned SubElems = SubVec.getValueType().getVectorNumElements();
+
+  // If we know the index is 0, and we know the subvector doesn't cross the
+  // boundary between the halves, we can avoid spilling the vector, and insert
+  // into the lower half of the split vector directly.
+  // TODO: The IdxVal == 0 constraint is artificial, we could do this whenever
+  // the index is constant and there is no boundary crossing. But those cases
+  // don't seem to get hit in practice.
+  if (ConstantSDNode *ConstIdx = dyn_cast<ConstantSDNode>(Idx)) {
+    unsigned IdxVal = ConstIdx->getZExtValue();
+    if ((IdxVal == 0) && (IdxVal + SubElems <= VecElems / 2)) {
+      EVT LoVT, HiVT;
+      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+      Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
+      return;
+    }
+  }
+
+  // Spill the vector to the stack.
   SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
                                MachinePointerInfo(), false, false, 0);
 
   // Store the new subvector into the specified index.
-  SDValue SubVecPtr = GetVectorElementPointer(StackPtr, SubVecVT, Idx);
+  SDValue SubVecPtr = GetVectorElementPointer(StackPtr, VecElemVT, Idx);
   Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
   unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
   Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo(),
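
For intuition, both legalization strategies compute the same value; the new path just avoids the round trip through memory. A plain-C++ model of the v8i32 / v4i32 case (hypothetical scalar arrays standing in for DAG values, my own illustration):

#include <array>
#include <cassert>
#include <cstring>

int main() {
  std::array<int, 8> Vec    = {0, 1, 2, 3, 4, 5, 6, 7}; // v8i32, split in two
  std::array<int, 4> SubVec = {40, 41, 42, 43};         // v4i32 to insert
  const unsigned IdxVal = 0;                            // known-constant index

  // Old path: spill the whole vector to a stack slot, overwrite the
  // subvector's slice in memory, then reload both halves.
  std::array<int, 8> Slot = Vec;
  std::memcpy(&Slot[IdxVal], SubVec.data(), sizeof(SubVec));
  std::array<int, 4> LoMem, HiMem;
  std::memcpy(LoMem.data(), &Slot[0], sizeof(LoMem));
  std::memcpy(HiMem.data(), &Slot[4], sizeof(HiMem));

  // New path: the subvector fits entirely in the low half, so rewrite the
  // low half directly and leave the high half untouched.
  std::array<int, 4> LoReg = SubVec;
  std::array<int, 4> HiReg = {Vec[4], Vec[5], Vec[6], Vec[7]};

  assert(LoReg == LoMem && HiReg == HiMem); // the two strategies agree
  return 0;
}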

test/CodeGen/X86/sad.ll

@@ -11,52 +11,31 @@
define i32 @sad_16i8() nounwind {
; SSE2-LABEL: sad_16i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: movq %rsp, %rbp
; SSE2-NEXT: andq $-64, %rsp
; SSE2-NEXT: subq $128, %rsp
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB0_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: movdqu a+1024(%rax), %xmm5
; SSE2-NEXT: movdqu b+1024(%rax), %xmm0
; SSE2-NEXT: movdqa %xmm4, (%rsp)
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp)
; SSE2-NEXT: psadbw %xmm5, %xmm0
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsp)
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT: movdqu a+1024(%rax), %xmm2
; SSE2-NEXT: movdqu b+1024(%rax), %xmm3
; SSE2-NEXT: psadbw %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB0_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movq %rbp, %rsp
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_16i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-64, %rsp
; AVX2-NEXT: subq $128, %rsp
; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
@@ -64,25 +43,19 @@ define i32 @sad_16i8() nounwind {
; AVX2-NEXT: .LBB0_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2
; AVX2-NEXT: vmovdqa %ymm0, (%rsp)
; AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm1
; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rsp)
; AVX2-NEXT: vmovdqa (%rsp), %ymm0
; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB0_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -311,52 +284,32 @@ define i32 @sad_32i8() nounwind {
;
; AVX2-LABEL: sad_32i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-128, %rsp
; AVX2-NEXT: subq $256, %rsp # imm = 0x100
; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB1_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm4
; AVX2-NEXT: vmovdqa %ymm0, (%rsp)
; AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm4, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rsp)
; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB1_1
; AVX2-NEXT: # BB#2: # %middle.block
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_32i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-128, %rsp
; AVX512F-NEXT: subq $256, %rsp # imm = 0x100
; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
@@ -364,17 +317,13 @@ define i32 @sad_32i8() nounwind {
; AVX512F-NEXT: .LBB1_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512F-NEXT: vmovdqa32 %zmm0, (%rsp)
; AVX512F-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm1
; AVX512F-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rsp)
; AVX512F-NEXT: vmovdqa32 {{[0-9]+}}(%rsp), %zmm1
; AVX512F-NEXT: vmovdqa32 (%rsp), %zmm0
; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB1_1
; AVX512F-NEXT: # BB#2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
@@ -386,16 +335,10 @@ define i32 @sad_32i8() nounwind {
; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_32i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: pushq %rbp
; AVX512BW-NEXT: movq %rsp, %rbp
; AVX512BW-NEXT: andq $-128, %rsp
; AVX512BW-NEXT: subq $256, %rsp # imm = 0x100
; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
@@ -403,17 +346,13 @@ define i32 @sad_32i8() nounwind {
; AVX512BW-NEXT: .LBB1_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512BW-NEXT: vmovdqa32 %zmm0, (%rsp)
; AVX512BW-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm1
; AVX512BW-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsp)
; AVX512BW-NEXT: vmovdqa32 {{[0-9]+}}(%rsp), %zmm1
; AVX512BW-NEXT: vmovdqa32 (%rsp), %zmm0
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512BW-NEXT: vpaddd %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB1_1
; AVX512BW-NEXT: # BB#2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
@@ -425,8 +364,6 @@ define i32 @sad_32i8() nounwind {
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: movq %rbp, %rsp
; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: retq
entry:
br label %vector.body
@@ -888,35 +825,21 @@ define i32 @sad_avx64i8() nounwind {
;
; AVX512BW-LABEL: sad_avx64i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: pushq %rbp
; AVX512BW-NEXT: movq %rsp, %rbp
; AVX512BW-NEXT: andq $-256, %rsp
; AVX512BW-NEXT: subq $512, %rsp # imm = 0x200
; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpxord %zmm3, %zmm3, %zmm3
; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB2_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqu8 a+1024(%rax), %zmm4
; AVX512BW-NEXT: vmovdqa32 %zmm0, (%rsp)
; AVX512BW-NEXT: vmovdqa32 %zmm2, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovdqa32 %zmm3, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm4, %zmm1
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqa32 %zmm0, (%rsp)
; AVX512BW-NEXT: vmovdqa32 {{[0-9]+}}(%rsp), %zmm1
; AVX512BW-NEXT: vmovdqa32 {{[0-9]+}}(%rsp), %zmm3
; AVX512BW-NEXT: vmovdqa32 {{[0-9]+}}(%rsp), %zmm2
; AVX512BW-NEXT: vmovdqu8 a+1024(%rax), %zmm2
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB2_1
; AVX512BW-NEXT: # BB#2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
@@ -928,8 +851,6 @@ define i32 @sad_avx64i8() nounwind {
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: movq %rbp, %rsp
; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: retq
entry:
br label %vector.body