From 7491f1f32f43989969223730401ab545afd795a6 Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Fri, 21 Nov 2014 14:33:24 +0000
Subject: [PATCH] [x86] Make the previous logic significantly less
 conservative and get a bunch more improvements.

Non-lane-crossing is fine, the key is that lane merging only makes sense
for single-input shuffles. Not sure why I got so turned around here. The
code all works, I was just using the wrong model for it.

This only updates v4 and v8 lowering. The v16 and v32 lowering requires
restructuring the entire check sequence.

llvm-svn: 222537
---
 lib/Target/X86/X86ISelLowering.cpp        | 24 +++----
 test/CodeGen/X86/vector-shuffle-256-v4.ll | 28 +++-----
 test/CodeGen/X86/vector-shuffle-256-v8.ll | 84 +++++++----------
 3 files changed, 42 insertions(+), 94 deletions(-)
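Annotation (after the --- marker, so git-am ignores it): the transform builds
one new vector out of whole 128-bit lanes of the two inputs and then shuffles
it in-lane, so it pays off exactly when the mask draws elements from both
inputs, which is what the new assert encodes; whether any element crosses a
128-bit lane no longer gates it. Below is a minimal sketch of that mask
property, assuming LLVM's convention that mask elements in [0, Size) select
the first input and elements in [Size, 2*Size) select the second.
isSingleInputMaskSketch is a hypothetical stand-in for the in-tree
isSingleInputShuffleMask, not a copy of it.

    #include <cassert>

    // Sketch of the mask property gating
    // lowerVectorShuffleByMerging128BitLanes after this change. Convention:
    // element M in [0, Size) reads input 1, M in [Size, 2*Size) reads
    // input 2 (hypothetical stand-in, not the in-tree helper).
    static bool isSingleInputMaskSketch(const int *Mask, int Size) {
      for (int i = 0; i < Size; ++i)
        if (Mask[i] >= Size)
          return false; // This element reads the second input.
      return true;
    }

    int main() {
      // shuffle_v4f64_1076 below: elements 1,0 come from %a and 7,6 from %b.
      // Two live inputs, so lane merging applies even though nothing crosses
      // a 128-bit lane (result lane 0 is %a's lane 0, lane 1 is %b's lane 1).
      int TwoInput[4] = {1, 0, 7, 6};
      assert(!isSingleInputMaskSketch(TwoInput, 4));

      // A single-input mask: merging lanes buys nothing here, and the other
      // single-input lowering strategies handle it instead.
      int OneInput[4] = {1, 0, 3, 2};
      assert(isSingleInputMaskSketch(OneInput, 4));
      return 0;
    }

The payoff shows up in the test updates below: with two live inputs, a mask
like 1,0,7,6 now lowers to one vblendpd that merges the lanes plus one
in-lane vpermilpd, replacing the old AVX1 extract/permute/insert sequence.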
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 36873cf6241..76f35070c63 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9999,8 +9999,8 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
 static SDValue lowerVectorShuffleByMerging128BitLanes(
     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
-  assert(is128BitLaneCrossingShuffleMask(VT, Mask) &&
-         "This is only useful when there are cross-128-bit-lane shuffles.");
+  assert(!isSingleInputShuffleMask(Mask) &&
+         "This is only useful with multiple inputs.");
 
   int Size = Mask.size();
   int LaneSize = 128 / VT.getScalarSizeInBits();
@@ -10170,8 +10170,7 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   // shuffle. However, if we have AVX2 and either inputs are already in place,
   // we will be able to shuffle even across lanes the other input in a single
   // instruction so skip this pattern.
-  if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
-      !(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
                                  isShuffleMaskInputInPlace(1, Mask))))
     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return Result;
@@ -10251,8 +10250,7 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   // shuffle. However, if we have AVX2 and either inputs are already in place,
   // we will be able to shuffle even across lanes the other input in a single
   // instruction so skip this pattern.
-  if (is128BitLaneCrossingShuffleMask(MVT::v4i64, Mask) &&
-      !(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
                                  isShuffleMaskInputInPlace(1, Mask))))
     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
       return Result;
@@ -10337,10 +10335,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
-  if (is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
-    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
-            DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
-      return Result;
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+    return Result;
 
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
@@ -10407,10 +10404,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
 
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
-  if (is128BitLaneCrossingShuffleMask(MVT::v8i32, Mask))
-    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
-            DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
-      return Result;
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Result;
 
   // Otherwise fall back on generic blend lowering.
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 19f3c7743ca..720c4c633ae 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -331,20 +331,11 @@ define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_1076:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v4f64_1076:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
-; AVX2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v4f64_1076:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
   ret <4 x double> %shuffle
 }
@@ -708,17 +699,14 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_1076:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_1076:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
   ret <4 x i64> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index d04adf9f7e8..77903da3558 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -746,20 +746,11 @@ define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_3210fedc:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_3210fedc:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_3210fedc:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
   ret <8 x float> %shuffle
 }
@@ -785,39 +776,21 @@ define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_ba987654:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_ba987654:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_ba987654:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_ba983210:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_ba983210:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_ba983210:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
   ret <8 x float> %shuffle
 }
@@ -1774,17 +1747,14 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_3210fedc:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_3210fedc:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
   ret <8 x i32> %shuffle
@@ -1825,17 +1795,14 @@ define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_ba987654:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_ba987654:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
   ret <8 x i32> %shuffle
@@ -1844,17 +1811,14 @@ define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_ba983210:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_ba983210:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i32> %shuffle