From 3493e43afd484da11e5c03e712b320627c0a8656 Mon Sep 17 00:00:00 2001
From: Evan Cheng
Date: Fri, 9 May 2008 21:53:03 +0000
Subject: [PATCH] Handle a few more cases of folding load i64 into xmm and
 zero top bits.

Note, some of the code will be moved into target independent part of DAG
combiner in a subsequent patch.

llvm-svn: 50918
---
 lib/Target/X86/X86ISelDAGToDAG.cpp |  2 +-
 lib/Target/X86/X86ISelLowering.cpp | 68 ++++++++++++++++++++++++++----
 lib/Target/X86/X86ISelLowering.h   |  7 ++-
 lib/Target/X86/X86InstrMMX.td      |  8 ++--
 lib/Target/X86/X86InstrSSE.td      | 34 +++++++++------
 test/CodeGen/X86/vec_set-C.ll      |  1 +
 test/CodeGen/X86/vec_set-F.ll      | 19 +++++++++
 7 files changed, 110 insertions(+), 29 deletions(-)
 create mode 100644 test/CodeGen/X86/vec_set-F.ll

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 1e9a0da9c68..c92e7697385 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -975,7 +975,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
 
   // Also handle the case where we explicitly require zeros in the top
   // elements. This is a vector shuffle from the zero vector.
-  if (N.getOpcode() == X86ISD::ZEXT_VMOVL && N.Val->hasOneUse() &&
+  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.Val->hasOneUse() &&
       // Check to see if the top elements are all zeros (or bitcast of zeros).
       N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
       N.getOperand(0).Val->hasOneUse() &&
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d2441fc1372..e9605bb174b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -715,6 +715,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
 
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::BUILD_VECTOR);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::STORE);
 
@@ -3481,9 +3482,9 @@ SDOperand RewriteAsNarrowerShuffle(SDOperand V1, SDOperand V2,
                               &MaskVec[0], MaskVec.size()));
 }
 
-/// getZextVMoveL - Return a zero-extending vector move low node.
+/// getVZextMovL - Return a zero-extending vector move low node.
 ///
-static SDOperand getZextVMoveL(MVT::ValueType VT, MVT::ValueType OpVT,
+static SDOperand getVZextMovL(MVT::ValueType VT, MVT::ValueType OpVT,
                               SDOperand SrcOp, SelectionDAG &DAG,
                               const X86Subtarget *Subtarget) {
   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
@@ -3501,7 +3502,7 @@ static SDOperand getZextVMoveL(MVT::ValueType VT, MVT::ValueType OpVT,
           // PR2108
           OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
          return DAG.getNode(ISD::BIT_CONVERT, VT,
-                            DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
+                            DAG.getNode(X86ISD::VZEXT_MOVL, OpVT,
                                         DAG.getNode(ISD::SCALAR_TO_VECTOR, OpVT,
                                                     SrcOp.getOperand(0).getOperand(0))));
         }
@@ -3509,7 +3510,7 @@ static SDOperand getZextVMoveL(MVT::ValueType VT, MVT::ValueType OpVT,
   }
 
   return DAG.getNode(ISD::BIT_CONVERT, VT,
-                     DAG.getNode(X86ISD::ZEXT_VMOVL, OpVT,
+                     DAG.getNode(X86ISD::VZEXT_MOVL, OpVT,
                                  DAG.getNode(ISD::BIT_CONVERT, OpVT, SrcOp)));
 }
 
@@ -3561,14 +3562,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
         SDOperand NewMask = NewOp.getOperand(2);
         if (isCommutedMOVL(NewMask.Val, true, false)) {
           NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
-          return getZextVMoveL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget);
+          return getVZextMovL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget);
         }
       }
     } else if (ISD::isBuildVectorAllZeros(V1.Val)) {
       SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
       if (NewOp.Val && X86::isMOVLMask(NewOp.getOperand(2).Val))
-        return getZextVMoveL(VT, NewOp.getValueType(), NewOp.getOperand(1),
+        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                             DAG, Subtarget);
     }
   }
@@ -3577,7 +3578,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
     if (V1IsUndef)
       return V2;
     if (ISD::isBuildVectorAllZeros(V1.Val))
-      return getZextVMoveL(VT, VT, V2, DAG, Subtarget);
+      return getVZextMovL(VT, VT, V2, DAG, Subtarget);
     return Op;
   }
 
@@ -5675,7 +5676,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
   case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
   case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
-  case X86ISD::ZEXT_VMOVL: return "X86ISD::ZEXT_VMOVL";
+  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
+  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
   }
 }
 
@@ -6302,6 +6304,55 @@ static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                        LD->getAlignment());
 }
 
+static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
+  SDOperand Elt = N->getOperand(i);
+  if (Elt.getOpcode() != ISD::MERGE_VALUES)
+    return Elt.Val;
+  return Elt.getOperand(Elt.ResNo).Val;
+}
+
+static SDOperand PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
+                                           const X86Subtarget *Subtarget) {
+  // Ignore single operand BUILD_VECTOR.
+  if (N->getNumOperands() == 1)
+    return SDOperand();
+
+  MVT::ValueType VT = N->getValueType(0);
+  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+  if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
+    // We are looking for load i64 and zero extend. We want to transform
+    // it before legalizer has a chance to expand it. Also look for i64
+    // BUILD_PAIR bit casted to f64.
+    return SDOperand();
+  // This must be an insertion into a zero vector.
+  SDOperand HighElt = N->getOperand(1);
+  if (HighElt.getOpcode() != ISD::UNDEF &&
+      !isZeroNode(HighElt))
+    return SDOperand();
+
+  // Value must be a load.
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  SDNode *Base = N->getOperand(0).Val;
+  if (!isa<LoadSDNode>(Base)) {
+    if (Base->getOpcode() == ISD::BIT_CONVERT)
+      Base = Base->getOperand(0).Val;
+    if (Base->getOpcode() != ISD::BUILD_PAIR)
+      return SDOperand();
+    SDNode *Pair = Base;
+    Base = getBuildPairElt(Pair, 0);
+    if (!ISD::isNON_EXTLoad(Base))
+      return SDOperand();
+    SDNode *NextLD = getBuildPairElt(Pair, 1);
+    if (!ISD::isNON_EXTLoad(NextLD) ||
+        !isConsecutiveLoad(NextLD, Base, 1, 4/*32 bits*/, MFI))
+      return SDOperand();
+  }
+  LoadSDNode *LD = cast<LoadSDNode>(Base);
+
+  // Transform it into VZEXT_LOAD addr.
+  return DAG.getNode(X86ISD::VZEXT_LOAD, VT, LD->getChain(), LD->getBasePtr());
+}
+
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
 static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget *Subtarget) {
@@ -6498,6 +6549,7 @@ SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
   default: break;
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, Subtarget);
+  case ISD::BUILD_VECTOR:   return PerformBuildVectorCombine(N, DAG, Subtarget);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
   case X86ISD::FXOR:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index a6556b7695a..f42ff3960d3 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -201,8 +201,11 @@ namespace llvm {
       // FNSTCW16m - Store FP control world into i16 memory.
       FNSTCW16m,
 
-      // ZEXT_VMOVL - Vector move low and zero extend.
-      ZEXT_VMOVL
+      // VZEXT_MOVL - Vector move low and zero extend.
+      VZEXT_MOVL,
+
+      // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+      VZEXT_LOAD
     };
   }
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 3c1fc750a14..42f19af1f8b 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -201,12 +201,12 @@ let AddedComplexity = 15 in
 def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
                             "movd\t{$src, $dst|$dst, $src}",
                             [(set VR64:$dst,
-                              (v2i32 (X86zvmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
+                              (v2i32 (X86vzmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
 let AddedComplexity = 20 in
 def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
                             "movd\t{$src, $dst|$dst, $src}",
                             [(set VR64:$dst,
-                              (v2i32 (X86zvmovl (v2i32
+                              (v2i32 (X86vzmovl (v2i32
                                       (scalar_to_vector (loadi32 addr:$src))))))]>;
 
 // Arithmetic Instructions
@@ -560,9 +560,9 @@ def : Pat<(i64 (bitconvert (v8i8 VR64:$src))),
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
-  def : Pat<(v8i8 (X86zvmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
+  def : Pat<(v8i8 (X86vzmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
             (MMX_MOVZDI2PDIrr GR32:$src)>;
-  def : Pat<(v4i16 (X86zvmovl (bc_v8i8 (v2i32 (scalar_to_vector GR32:$src))))),
+  def : Pat<(v4i16 (X86vzmovl (bc_v4i16 (v2i32 (scalar_to_vector GR32:$src))))),
             (MMX_MOVZDI2PDIrr GR32:$src)>;
 }
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index c56d4c59e06..12e8b7066ff 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -47,7 +47,10 @@ def X86pinsrw : SDNode<"X86ISD::PINSRW",
 def X86insrtps : SDNode<"X86ISD::INSERTPS",
                  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
                                       SDTCisVT<2, f32>, SDTCisPtrTy<3>]>>;
-def X86zvmovl : SDNode<"X86ISD::ZEXT_VMOVL", SDTUnaryOp>;
+def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
+                       SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+                       [SDNPHasChain, SDNPMayLoad]>;
 
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
@@ -1008,10 +1011,10 @@ let neverHasSideEffects = 1 in
 let AddedComplexity = 20 in
 def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
                       "movss\t{$src, $dst|$dst, $src}",
-                      [(set VR128:$dst, (v4f32 (X86zvmovl (v4f32 (scalar_to_vector
+                      [(set VR128:$dst, (v4f32 (X86vzmovl (v4f32 (scalar_to_vector
                                                  (loadf32 addr:$src))))))]>;
 
-def : Pat<(v4f32 (X86zvmovl (memopv4f32 addr:$src))),
+def : Pat<(v4f32 (X86vzmovl (memopv4f32 addr:$src))),
           (MOVZSS2PSrm addr:$src)>;
 
 //===----------------------------------------------------------------------===//
@@ -2266,22 +2269,23 @@ let AddedComplexity = 20 in
 def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                       "movsd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
-                        (v2f64 (X86zvmovl (v2f64 (scalar_to_vector
+                        (v2f64 (X86vzmovl (v2f64 (scalar_to_vector
                                                  (loadf64 addr:$src))))))]>;
 
-def : Pat<(v2f64 (X86zvmovl (memopv2f64 addr:$src))),
+def : Pat<(v2f64 (X86vzmovl (memopv2f64 addr:$src))),
           (MOVZSD2PDrm addr:$src)>;
+def : Pat<(v2f64 (X86vzload addr:$src)), (MOVZSD2PDrm addr:$src)>;
 
 // movd / movq to XMM register zero-extends
 let AddedComplexity = 15 in {
 def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (v4i32 (X86zvmovl
+                       [(set VR128:$dst, (v4i32 (X86vzmovl
                                       (v4i32 (scalar_to_vector GR32:$src)))))]>;
 // This is X86-64 only.
 def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst, (v2i64 (X86zvmovl
+                        [(set VR128:$dst, (v2i64 (X86vzmovl
                                       (v2i64 (scalar_to_vector GR64:$src)))))]>;
 }
 
@@ -2289,28 +2293,30 @@ let AddedComplexity = 20 in {
 def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v4i32 (X86zvmovl (v4i32 (scalar_to_vector
+                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
                                                    (loadi32 addr:$src))))))]>;
 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
-                       (v2i64 (X86zvmovl (v2i64 (scalar_to_vector
+                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                  (loadi64 addr:$src))))))]>, XS,
                    Requires<[HasSSE2]>;
 }
 
+def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+
 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
 // IA32 document. movq xmm1, xmm2 does clear the high bits.
 let AddedComplexity = 15 in
 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
-                    [(set VR128:$dst, (v2i64 (X86zvmovl (v2i64 VR128:$src))))]>,
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                     XS, Requires<[HasSSE2]>;
 
 let AddedComplexity = 20 in
 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
-                    [(set VR128:$dst, (v2i64 (X86zvmovl
+                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                              (memopv2i64 addr:$src))))]>,
                     XS, Requires<[HasSSE2]>;
 
@@ -2758,9 +2764,9 @@ let Predicates = [HasSSE2] in {
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
 // Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (X86zvmovl (v2f64 (scalar_to_vector FR64:$src)))),
+def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
           (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4f32 (X86zvmovl (v4f32 (scalar_to_vector FR32:$src)))),
+def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
           (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
 }
 
@@ -2916,7 +2922,7 @@ let AddedComplexity = 15 in
 def : Pat<(v2f64 (vector_shuffle immAllZerosV_bc, VR128:$src,
                   MOVL_shuffle_mask)),
           (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v2f64 (X86zvmovl (v2f64 VR128:$src))),
+def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
           (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
 
 // FIXME: Temporary workaround since 2-wide shuffle is broken.
diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll
index eef9a61ab94..fc86853e10f 100644
--- a/test/CodeGen/X86/vec_set-C.ll
+++ b/test/CodeGen/X86/vec_set-C.ll
@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep mov | count 1
 ; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep movd
 
 define <2 x i64> @t1(i64 %x) nounwind {
diff --git a/test/CodeGen/X86/vec_set-F.ll b/test/CodeGen/X86/vec_set-F.ll
new file mode 100644
index 00000000000..db83eb2e853
--- /dev/null
+++ b/test/CodeGen/X86/vec_set-F.ll
@@ -0,0 +1,19 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movq
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movsd
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep mov | count 3
+
+define <2 x i64> @t1(<2 x i64>* %ptr) nounwind {
+	%tmp45 = bitcast <2 x i64>* %ptr to <2 x i32>*
+	%tmp615 = load <2 x i32>* %tmp45
+	%tmp7 = bitcast <2 x i32> %tmp615 to i64
+	%tmp8 = insertelement <2 x i64> zeroinitializer, i64 %tmp7, i32 0
+	ret <2 x i64> %tmp8
+}
+
+define <2 x i64> @t2(i64 %x) nounwind {
+	%tmp717 = bitcast i64 %x to double
+	%tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0
+	%tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1
+	%tmp11 = bitcast <2 x double> %tmp9 to <2 x i64>
+	ret <2 x i64> %tmp11
+}
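--
Illustrative note (not part of the patch): the new X86ISD::VZEXT_LOAD node models what a single movq from memory does on x86 -- load 64 bits into the low lane of an XMM register and zero the upper 64 bits -- which is the DAG shape (BUILD_VECTOR of a load i64 and a zero/undef high element) that PerformBuildVectorCombine now collapses into one node instead of letting the legalizer expand it. The small standalone C++ sketch below, written against the SSE2 intrinsics in <emmintrin.h> rather than any LLVM API, just demonstrates that instruction-level semantics; the variable names and the tiny print harness are mine, not taken from the patch.

#include <emmintrin.h>   // SSE2 intrinsics: _mm_loadl_epi64, _mm_storeu_si128
#include <cstdint>
#include <cstdio>

int main() {
  std::uint64_t x = 0x1122334455667788ULL;

  // movq xmm, m64: load 64 bits into lane 0 and zero the upper 64 bits.
  // This is the operation the combine folds a "load i64 + insert into a
  // zero vector" sequence into, instead of a separate load and shuffle.
  __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&x));

  std::uint64_t lanes[2];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes), v);
  std::printf("lane0 = %016llx, lane1 = %016llx\n",
              static_cast<unsigned long long>(lanes[0]),
              static_cast<unsigned long long>(lanes[1]));
  // Expected output: lane0 = 1122334455667788, lane1 = 0000000000000000,
  // matching what the vec_set-F.ll test checks for via "grep movq".
  return 0;
}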