mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 05:01:59 +01:00
Fix the buildvector->insertp[sd] logic to not always create a redundant
insertp[sd] $0, which is a no-op.

Before:

_f32:                                   ## @f32
    pshufd   $1, %xmm1, %xmm2
    pshufd   $1, %xmm0, %xmm3
    addss    %xmm2, %xmm3
    addss    %xmm1, %xmm0
                                        ## kill: XMM0<def> XMM0<kill> XMM0<def>
    insertps $0, %xmm0, %xmm0
    insertps $16, %xmm3, %xmm0
    ret

After:

_f32:                                   ## @f32
    movdqa   %xmm0, %xmm2
    addss    %xmm1, %xmm2
    pshufd   $1, %xmm1, %xmm1
    pshufd   $1, %xmm0, %xmm3
    addss    %xmm1, %xmm3
    movdqa   %xmm2, %xmm0
    insertps $16, %xmm3, %xmm0
    ret

The extra movs are due to a random (poor) scheduling decision.

llvm-svn: 112379
This commit is contained in:
parent
c3b630d64b
commit
8cb4abbc0e
@ -4278,14 +4278,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
|
|||||||
if (LD.getNode())
|
if (LD.getNode())
|
||||||
return LD;
|
return LD;
|
||||||
|
|
||||||
// For SSE 4.1, use inserts into undef.
|
// For SSE 4.1, use insertps to put the high elements into the low element.
|
||||||
if (getSubtarget()->hasSSE41()) {
|
if (getSubtarget()->hasSSE41()) {
|
||||||
V[0] = DAG.getUNDEF(VT);
|
SDValue Result;
|
||||||
for (unsigned i = 0; i < NumElems; ++i)
|
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
|
||||||
if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
|
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
|
||||||
V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
|
else
|
||||||
|
Result = DAG.getUNDEF(VT);
|
||||||
|
|
||||||
|
for (unsigned i = 1; i < NumElems; ++i) {
|
||||||
|
if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
|
||||||
|
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
|
||||||
Op.getOperand(i), DAG.getIntPtrConstant(i));
|
Op.getOperand(i), DAG.getIntPtrConstant(i));
|
||||||
return V[0];
|
}
|
||||||
|
return Result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Otherwise, expand into a number of unpckl*, start by extending each of
|
// Otherwise, expand into a number of unpckl*, start by extending each of
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -stats |& grep {7 machine-licm}
|
; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -stats |& grep {6 machine-licm}
|
||||||
; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 | FileCheck %s
|
; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 | FileCheck %s
|
||||||
; rdar://6627786
|
; rdar://6627786
|
||||||
; rdar://7792037
|
; rdar://7792037
|
||||||
|
@ -224,3 +224,28 @@ declare i32 @llvm.x86.sse41.ptestz(<4 x float>, <4 x float>) nounwind readnone
|
|||||||
declare i32 @llvm.x86.sse41.ptestc(<4 x float>, <4 x float>) nounwind readnone
|
declare i32 @llvm.x86.sse41.ptestc(<4 x float>, <4 x float>) nounwind readnone
|
||||||
declare i32 @llvm.x86.sse41.ptestnzc(<4 x float>, <4 x float>) nounwind readnone
|
declare i32 @llvm.x86.sse41.ptestnzc(<4 x float>, <4 x float>) nounwind readnone
|
||||||
|
|
||||||
|
; This used to compile to insertps $0 + insertps $16. insertps $0 is always
|
||||||
|
; pointless.
|
||||||
|
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
|
||||||
|
entry:
|
||||||
|
%tmp7 = extractelement <2 x float> %A, i32 0
|
||||||
|
%tmp5 = extractelement <2 x float> %A, i32 1
|
||||||
|
%tmp3 = extractelement <2 x float> %B, i32 0
|
||||||
|
%tmp1 = extractelement <2 x float> %B, i32 1
|
||||||
|
%add.r = fadd float %tmp7, %tmp3
|
||||||
|
%add.i = fadd float %tmp5, %tmp1
|
||||||
|
%tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
|
||||||
|
%tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
|
||||||
|
ret <2 x float> %tmp9
|
||||||
|
; X32: buildvector:
|
||||||
|
; X32-NOT: insertps $0
|
||||||
|
; X32: insertps $16
|
||||||
|
; X32-NOT: insertps $0
|
||||||
|
; X32: ret
|
||||||
|
; X64: buildvector:
|
||||||
|
; X64-NOT: insertps $0
|
||||||
|
; X64: insertps $16
|
||||||
|
; X64-NOT: insertps $0
|
||||||
|
; X64: ret
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
; RUN: llc < %s -march=x86 -mattr=+sse41 > %t
|
; RUN: llc < %s -march=x86 -mattr=+sse41 > %t
|
||||||
; RUN: grep pinsrd %t | count 2
|
; RUN: grep pinsrd %t | count 1
|
||||||
|
|
||||||
define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
|
define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
|
||||||
entry:
|
entry:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user