From 481e1fd0a6722af8226ef5fc70bbd13ca8969d8d Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Fri, 31 Oct 2008 00:57:24 +0000 Subject: [PATCH] Use MOVSSmr instead of EXTRACTPSmr in the case of extracting vector element 0 for a store, as it's smaller and faster. llvm-svn: 58483 --- lib/Target/X86/X86ISelLowering.cpp | 8 ++++++-- test/CodeGen/X86/extractps.ll | 12 ++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 7e1b7a0c76d..cdf16707813 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -4194,11 +4194,15 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, } else if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the - // result has a single use which is a store or a bitcast to i32. + // result has a single use which is a store or a bitcast to i32. And in + // the case of a store, it's not worth it if the index is a constant 0, + // because a MOVSSmr can be used instead, which is smaller and faster. 
if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); - if (User->getOpcode() != ISD::STORE && + if ((User->getOpcode() != ISD::STORE || + (isa<ConstantSDNode>(Op.getOperand(1)) && + cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && (User->getOpcode() != ISD::BIT_CONVERT || User->getValueType(0) != MVT::i32)) return SDValue(); diff --git a/test/CodeGen/X86/extractps.ll b/test/CodeGen/X86/extractps.ll index cc6d92c8d39..484d2c4e5e1 100644 --- a/test/CodeGen/X86/extractps.ll +++ b/test/CodeGen/X86/extractps.ll @@ -1,7 +1,7 @@ ; RUN: llvm-as < %s | llc -march=x86 -mcpu=penryn > %t ; RUN: not grep movd %t -; RUN: not grep movss %t -; RUN: grep {extractps \\\$0, %xmm0, } %t +; RUN: grep {movss %xmm} %t | count 1 +; RUN: grep {extractps \\\$1, %xmm0, } %t | count 1 ; PR2647 external global float, align 16 ; <float*>:0 [#uses=2] @@ -14,6 +14,14 @@ define internal void @""() nounwind { store float %4, float* @0, align 16 ret void } +define internal void @""() nounwind { + load float* @0, align 16 ; <float>:1 [#uses=1] + insertelement <4 x float> undef, float %1, i32 1 ; <<4 x float>>:2 [#uses=1] + call <4 x float> @llvm.x86.sse.rsqrt.ss( <4 x float> %2 ) ; <<4 x float>>:3 [#uses=1] + extractelement <4 x float> %3, i32 1 ; <float>:4 [#uses=1] + store float %4, float* @0, align 16 + ret void +} declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone