mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 04:02:41 +01:00
~40% faster vector shl <4 x i32> on SSE 4.1. Larger improvements for smaller types are coming in future patches.
For: define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp { entry: %shl = shl <4 x i32> %r, %a ; <<4 x i32>> [#uses=1] %tmp2 = bitcast <4 x i32> %shl to <2 x i64> ; <<2 x i64>> [#uses=1] ret <2 x i64> %tmp2 } We get: _shl: ## @shl pslld $23, %xmm1 paddd LCPI0_0, %xmm1 cvttps2dq %xmm1, %xmm1 pmulld %xmm1, %xmm0 ret Instead of: _shl: ## @shl pshufd $3, %xmm0, %xmm2 movd %xmm2, %eax pshufd $3, %xmm1, %xmm2 movd %xmm2, %ecx shll %cl, %eax movd %eax, %xmm2 pshufd $1, %xmm0, %xmm3 movd %xmm3, %eax pshufd $1, %xmm1, %xmm3 movd %xmm3, %ecx shll %cl, %eax movd %eax, %xmm3 punpckldq %xmm2, %xmm3 movd %xmm0, %eax movd %xmm1, %ecx shll %cl, %eax movd %eax, %xmm2 movhlps %xmm0, %xmm0 movd %xmm0, %eax movhlps %xmm1, %xmm1 movd %xmm1, %ecx shll %cl, %eax movd %eax, %xmm0 punpckldq %xmm0, %xmm2 movdqa %xmm2, %xmm0 punpckldq %xmm3, %xmm0 ret llvm-svn: 109549
This commit is contained in:
parent
8c197e5dbb
commit
068e932975
@ -838,6 +838,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
|
||||
// FIXME: Do we need to handle scalar-to-vector here?
|
||||
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
|
||||
|
||||
// Can turn SHL into an integer multiply.
|
||||
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
|
||||
|
||||
// i8 and i16 vectors are custom, because the source register and source
|
||||
// memory operand types are not the same width. f32 vectors are
|
||||
// custom since the immediate controlling the insert encodes additional
|
||||
@ -7498,6 +7501,35 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
|
||||
return Res;
|
||||
}
|
||||
|
||||
/// Custom lowering for a v4i32 vector SHL: rewrite "R << Amt" as "R * (1 << Amt)".
///
/// The per-lane power of two is built without a variable vector shift:
///   1. shift each lane of the shift amount left by 23 (pslli.d intrinsic),
///   2. add 0x3f800000 to every lane,
///   3. reinterpret the sum as v4f32 and convert it back to v4i32,
///   4. multiply the original value by that vector.
///
/// Asserts that the subtarget has SSE4.1 and that the value type is v4i32.
SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  SDValue Value = Op.getOperand(0);
  SDValue ShiftAmt = Op.getOperand(1);

  assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
  assert(VT == MVT::v4i32 && "Only know how to lower v4i32");

  // Move each lane's shift amount up by 23 bits.
  SDValue IntrinsicId =
      DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32);
  SDValue Shifted = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, IntrinsicId,
                                ShiftAmt, DAG.getConstant(23, MVT::i32));

  // Splat of 0x3f800000 across all four lanes, loaded from the constant pool
  // with 16-byte alignment.
  LLVMContext *Context = DAG.getContext();
  Constant *Bias = ConstantInt::get(*Context, APInt(32, 0x3f800000U));
  std::vector<Constant*> CV(4, Bias);
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                               PseudoSourceValue::getConstantPool(), 0,
                               false, false, 16);

  // Add the splat, reinterpret as float, and convert back to integer to get
  // the multiplier vector.
  SDValue Biased = DAG.getNode(ISD::ADD, dl, VT, Shifted, Addend);
  SDValue AsFloat = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Biased);
  SDValue Multiplier = DAG.getNode(ISD::FP_TO_SINT, dl, VT, AsFloat);

  return DAG.getNode(ISD::MUL, dl, VT, Multiplier, Value);
}
|
||||
|
||||
SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
|
||||
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
|
||||
@ -7730,6 +7762,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
|
||||
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
|
||||
case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
|
||||
case ISD::SHL: return LowerSHL(Op, DAG);
|
||||
case ISD::SADDO:
|
||||
case ISD::UADDO:
|
||||
case ISD::SSUBO:
|
||||
|
@ -723,6 +723,7 @@ namespace llvm {
|
||||
SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
14
test/CodeGen/X86/vec_shift4.ll
Normal file
14
test/CodeGen/X86/vec_shift4.ll
Normal file
@ -0,0 +1,14 @@
|
||||
; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s
|
||||
|
||||
; Shift a <4 x i32> vector left by a per-lane variable amount. The CHECK lines
; require that the output contains no scalar shll and does contain pslld,
; paddd, cvttps2dq and pmulld, in that order.
define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
|
||||
entry:
|
||||
; CHECK-NOT: shll
|
||||
; CHECK: pslld
|
||||
; CHECK: paddd
|
||||
; CHECK: cvttps2dq
|
||||
; CHECK: pmulld
|
||||
|
||||
%shl = shl <4 x i32> %r, %a ; <<4 x i32>> [#uses=1]
|
||||
%tmp2 = bitcast <4 x i32> %shl to <2 x i64> ; <<2 x i64>> [#uses=1]
|
||||
ret <2 x i64> %tmp2
|
||||
}
|
Loading…
Reference in New Issue
Block a user