From e119c6afaf8c44c42e23f2f367cb26fe4e3152a4 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Sat, 17 Jan 2015 01:35:56 +0000 Subject: [PATCH] Improve DAG combine pass on certain IR vector patterns Loading two 2x32-bit float vectors into the bottom half of a 256-bit vector produced suboptimal code in AVX2 mode with certain IR combinations. In particular, the IR optimizer folded 2f32 + 2f32 -> 4f32, 4f32 + 4f32 (undef) -> 8f32 into a 2f32 + 2f32 -> 8f32, which seems more canonical, but then mysteriously generated rather bad code; the movq/movhpd combination didn't match. The problem lay in the BUILD_VECTOR optimization path. The 2f32 inputs would get promoted to 4f32 by the type legalizer, eventually resulting in a BUILD_VECTOR on two 4f32 into an 8f32. The BUILD_VECTOR then, recognizing these were both half the output size, concatenated them and then produced a shuffle. However, the resulting concat + shuffle was more complex than it should be; in the case where the upper half of the output is undef, we probably want to generate shuffle + concat instead. This enhancement causes the vector_shuffle combine step to recognize this suboptimal pattern and correct it. I included it there instead of in BUILD_VECTOR in case the same suboptimal pattern occurs for other reasons. This results in the optimizer correctly producing the optimal movq + movhpd sequence for all three variations on this IR, even with AVX2. I've included a test case. Radar link: rdar://problem/19287012 Fix for PR 21943. 
From: Fiona Glaser llvm-svn: 226360 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 15 +++++++- test/CodeGen/X86/vector-shuffle-256-v8.ll | 42 +++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5145731f623..3bde9918793 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -11347,7 +11347,8 @@ static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0, return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask()); } -// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat. +// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat, +// or turn a shuffle of a single concat into simpler shuffle then concat. static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -11361,6 +11362,18 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements(); unsigned NumConcats = NumElts / NumElemsPerConcat; + // Special case: shuffle(concat(A,B)) can be more efficiently represented + // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high + // half vector elements. + if (NumElemsPerConcat * 2 == NumElts && N1.getOpcode() == ISD::UNDEF && + std::all_of(SVN->getMask().begin() + NumElemsPerConcat, + SVN->getMask().end(), [](int i) { return i == -1; })) { + N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1), + ArrayRef(SVN->getMask().begin(), NumElemsPerConcat)); + N1 = DAG.getUNDEF(ConcatVT); + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); + } + // Look at every vector that's inserted. 
We're looking for exact // subvector-sized copies from a concatenated vector for (unsigned I = 0; I != NumConcats; ++I) { diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index 77903da3558..e4bd4c4f817 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1849,3 +1849,45 @@ define <8 x float> @splat_v8f32(<4 x float> %r) { %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer ret <8 x float> %1 } + +define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) { +; ALL-LABEL: concat_v2f32_1: +; ALL: # BB#0: # %entry +; ALL-NEXT: vmovq (%rdi), %xmm0 +; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp74 = load <2 x float>* %tmp65, align 8 + %tmp72 = load <2 x float>* %tmp64, align 8 + %tmp73 = shufflevector <2 x float> %tmp72, <2 x float> undef, <8 x i32> + %tmp75 = shufflevector <2 x float> %tmp74, <2 x float> undef, <8 x i32> + %tmp76 = shufflevector <8 x float> %tmp73, <8 x float> %tmp75, <8 x i32> + ret <8 x float> %tmp76 +} + +define <8x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) { +; ALL-LABEL: concat_v2f32_2: +; ALL: # BB#0: # %entry +; ALL-NEXT: vmovq (%rdi), %xmm0 +; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp74 = load <2 x float>* %tmp65, align 8 + %tmp72 = load <2 x float>* %tmp64, align 8 + %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <8 x i32> + ret <8 x float> %tmp76 +} + +define <8x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) { +; ALL-LABEL: concat_v2f32_3: +; ALL: # BB#0: # %entry +; ALL-NEXT: vmovq (%rdi), %xmm0 +; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0 +; ALL-NEXT: retq +entry: + %tmp74 = load <2 x float>* %tmp65, align 8 + %tmp72 = load <2 x float>* %tmp64, align 8 + %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <4 x i32> + %res = shufflevector <4 x float> %tmp76, <4 x float> 
undef, <8 x i32> + ret <8 x float> %res +}