diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 41d2899c702..6bb2d1ec9e5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -35126,123 +35126,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
       Op, DemandedBits, DemandedElts, DAG, Depth);
 }
 
-/// Check if a vector extract from a target-specific shuffle of a load can be
-/// folded into a single element load.
-/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
-/// shuffles have been custom lowered so we need to handle those here.
-static SDValue
-XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
-                                 TargetLowering::DAGCombinerInfo &DCI) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  SDValue InVec = N->getOperand(0);
-  SDValue EltNo = N->getOperand(1);
-  EVT EltVT = N->getValueType(0);
-
-  if (!isa<ConstantSDNode>(EltNo))
-    return SDValue();
-
-  EVT OriginalVT = InVec.getValueType();
-  unsigned NumOriginalElts = OriginalVT.getVectorNumElements();
-
-  // Peek through bitcasts, don't duplicate a load with other uses.
-  InVec = peekThroughOneUseBitcasts(InVec);
-
-  EVT CurrentVT = InVec.getValueType();
-  if (!CurrentVT.isVector())
-    return SDValue();
-
-  unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
-  if ((NumOriginalElts % NumCurrentElts) != 0)
-    return SDValue();
-
-  if (!isTargetShuffle(InVec.getOpcode()))
-    return SDValue();
-
-  // Don't duplicate a load with other uses.
-  if (!InVec.hasOneUse())
-    return SDValue();
-
-  SmallVector<int, 16> ShuffleMask;
-  SmallVector<SDValue, 2> ShuffleOps;
-  bool UnaryShuffle;
-  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
-                            ShuffleOps, ShuffleMask, UnaryShuffle))
-    return SDValue();
-
-  unsigned Scale = NumOriginalElts / NumCurrentElts;
-  if (Scale > 1) {
-    SmallVector<int, 16> ScaledMask;
-    scaleShuffleMask(Scale, ShuffleMask, ScaledMask);
-    ShuffleMask = std::move(ScaledMask);
-  }
-  assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");
-
-  // Select the input vector, guarding against out of range extract vector.
-  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
-  int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];
-
-  if (Idx == SM_SentinelZero)
-    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
-                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
-  if (Idx == SM_SentinelUndef)
-    return DAG.getUNDEF(EltVT);
-
-  // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
-  // won't handle it.
-  if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
-    return SDValue();
-
-  assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
-         "Shuffle index out of range");
-  SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];
-
-  // If inputs to shuffle are the same for both ops, then allow 2 uses
-  unsigned AllowedUses =
-      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
-
-  if (LdNode.getOpcode() == ISD::BITCAST) {
-    // Don't duplicate a load with other uses.
-    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
-      return SDValue();
-
-    AllowedUses = 1; // only allow 1 load use if we have a bitcast
-    LdNode = LdNode.getOperand(0);
-  }
-
-  if (!ISD::isNormalLoad(LdNode.getNode()))
-    return SDValue();
-
-  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
-
-  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
-    return SDValue();
-
-  // If there's a bitcast before the shuffle, check if the load type and
-  // alignment is valid.
-  unsigned Align = LN0->getAlignment();
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
-      EltVT.getTypeForEVT(*DAG.getContext()));
-
-  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
-    return SDValue();
-
-  // All checks match so transform back to vector_shuffle so that DAG combiner
-  // can finish the job
-  SDLoc dl(N);
-
-  // Create shuffle node taking into account the case that its a unary shuffle
-  SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
-                                 : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
-  Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
-                                 DAG.getBitcast(OriginalVT, ShuffleOps[0]),
-                                 Shuffle, ShuffleMask);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
-                     EltNo);
-}
-
 // Helper to peek through bitops/setcc to determine size of source vector.
 // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
 static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
@@ -36496,14 +36379,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
     }
 
     // TODO - Remove this once we can handle the implicit zero-extension of
-    // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,
-    // combineHorizontalPredicateResult and combineBasicSADPattern.
+    // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
+    // combineBasicSADPattern.
     return SDValue();
   }
 
-  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
-    return NewOp;
-
   // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
diff --git a/test/CodeGen/X86/2011-05-09-loaduse.ll b/test/CodeGen/X86/2011-05-09-loaduse.ll
index 61062b14180..53b71032429 100644
--- a/test/CodeGen/X86/2011-05-09-loaduse.ll
+++ b/test/CodeGen/X86/2011-05-09-loaduse.ll
@@ -5,15 +5,21 @@
 define float @test(<4 x float>* %A) nounwind {
 ; X86-LABEL: test:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    flds 12(%eax)
-; X86-NEXT:    movaps %xmm0, (%eax)
+; X86-NEXT:    movaps (%eax), %xmm0
+; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X86-NEXT:    xorps %xmm1, %xmm1
+; X86-NEXT:    movaps %xmm1, (%eax)
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movaps (%rdi), %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    movaps %xmm1, (%rdi)
 ; X64-NEXT:    retq
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index f2d07b1ee1a..76c563e728e 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -9,13 +9,20 @@ define i32 @t(<2 x i64>* %val) nounwind {
 ; X32-SSE2-LABEL: t:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    movl 8(%eax), %eax
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
+; X32-SSE2-NEXT:    movd %xmm0, %eax
 ; X32-SSE2-NEXT:    retl
 ;
-; X64-LABEL: t:
-; X64:       # %bb.0:
-; X64-NEXT:    movl 8(%rdi), %eax
-; X64-NEXT:    retq
+; X64-SSSE3-LABEL: t:
+; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
+; X64-SSSE3-NEXT:    movd %xmm0, %eax
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX-LABEL: t:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    movl 8(%rdi), %eax
+; X64-AVX-NEXT:    retq
   %tmp2 = load <2 x i64>, <2 x i64>* %val, align 16  ; <<2 x i64>> [#uses=1]
   %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32>  ; <<4 x i32>> [#uses=1]
   %tmp4 = extractelement <4 x i32> %tmp3, i32 2  ; <i32> [#uses=1]
@@ -127,7 +134,8 @@ define float @t6(<8 x float> *%a0) {
 ; X32-SSE2-NEXT:    pushl %eax
 ; X32-SSE2-NEXT:    .cfi_def_cfa_offset 8
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    movaps (%eax), %xmm0
+; X32-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X32-SSE2-NEXT:    xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
 ; X32-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -142,7 +150,7 @@ define float @t6(<8 x float> *%a0) {
 ;
 ; X64-SSSE3-LABEL: t6:
 ; X64-SSSE3:       # %bb.0:
-; X64-SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
 ; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
 ; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -165,3 +173,50 @@ define float @t6(<8 x float> *%a0) {
   %cond = select i1 %cmp, float 1.000000e+00, float %vecext
   ret float %cond
 }
+
+define void @PR43971(<8 x float> *%a0, float *%a1) {
+; X32-SSE2-LABEL: PR43971:
+; X32-SSE2:       # %bb.0: # %entry
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT:    movaps 16(%ecx), %xmm0
+; X32-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-SSE2-NEXT:    xorps %xmm1, %xmm1
+; X32-SSE2-NEXT:    cmpltss %xmm0, %xmm1
+; X32-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT:    andps %xmm1, %xmm2
+; X32-SSE2-NEXT:    andnps %xmm0, %xmm1
+; X32-SSE2-NEXT:    orps %xmm2, %xmm1
+; X32-SSE2-NEXT:    movss %xmm1, (%eax)
+; X32-SSE2-NEXT:    retl
+;
+; X64-SSSE3-LABEL: PR43971:
+; X64-SSSE3:       # %bb.0: # %entry
+; X64-SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT:    xorps %xmm1, %xmm1
+; X64-SSSE3-NEXT:    cmpltss %xmm0, %xmm1
+; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT:    andps %xmm1, %xmm2
+; X64-SSSE3-NEXT:    andnps %xmm0, %xmm1
+; X64-SSSE3-NEXT:    orps %xmm2, %xmm1
+; X64-SSSE3-NEXT:    movss %xmm1, (%rsi)
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX-LABEL: PR43971:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1
+; X64-AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-AVX-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovss %xmm0, (%rsi)
+; X64-AVX-NEXT:    retq
+entry:
+  %0 = load <8 x float>, <8 x float>* %a0, align 32
+  %vecext = extractelement <8 x float> %0, i32 6
+  %cmp = fcmp ogt float %vecext, 0.000000e+00
+  %1 = load float, float* %a1, align 4
+  %cond = select i1 %cmp, float %1, float %vecext
+  store float %cond, float* %a1, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/insertps-combine.ll b/test/CodeGen/X86/insertps-combine.ll
index 6bef76ee9df..c98553db6c5 100644
--- a/test/CodeGen/X86/insertps-combine.ll
+++ b/test/CodeGen/X86/insertps-combine.ll
@@ -269,12 +269,12 @@ define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
 define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
 ; SSE-LABEL: extract_lane_insertps_5123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movshdup {{.*#+}} xmm0 = mem[1,1,3,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_5123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = mem[1,1,3,3]
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, <4 x float> *%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 64)
@@ -285,12 +285,13 @@ define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
 define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) {
 ; SSE-LABEL: extract_lane_insertps_6123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_6123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, <4 x float> *%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index 3fb669dd45c..2d52bec473a 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -57,13 +57,15 @@ define void @test3(float* %R, <4 x float>* %P1) nounwind {
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    movaps (%ecx), %xmm0
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; X32-NEXT:    movss %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movaps (%rsi), %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; X64-NEXT:    movss %xmm0, (%rdi)
 ; X64-NEXT:    retq
 entry: