[x86] Tweak the rules surrounding 0,0 and 1,1 v2f64 shuffles and add

support for MOVDDUP which is really important for matrix multiply style operations that do lots of non-vector-aligned load and splats. The original motivation was to add support for MOVDDUP as the lack of it regresses matmul_f64_4x4 by 5% or so. However, all of the rules here were somewhat suspicious. First, we should always be using the floating point domain shuffles, regardless of how many copies we have to make as a movapd is *crazy* faster than the domain switching cost on some chips. (Mostly because movapd is crazy cheap.) Because SHUFPD can't do the copy-for-free trick of the PSHUF instructions, there is no need to avoid canonicalizing on UNPCK variants, so do that canonicalizing. This also ensures we have the chance to form MOVDDUP. =] Second, we assume SSE2 support when doing any vector lowering, and given that we should just use UNPCKLPD and UNPCKHPD as they can operate on registers or memory. If vectors get spilled or come from memory at all this is going to allow the load to be folded into the operation. If we want to optimize for encoding size (the only difference, and only a 2 byte difference) it should be done *much* later, likely after RA. llvm-svn: 217332
2024-11-23 19:23:23 +01:00 · 2014-09-07 12:02:14 +00:00 · 2014-09-07 12:02:14 +00:00 · 75df70c921
commit 75df70c921
parent 421418c484
3 changed files with 67 additions and 23 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -19320,26 +19320,42 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  // Use the float domain if the operand type is a floating point type.
  bool FloatDomain = VT.isFloatingPoint();

-  // If we don't have access to VEX encodings, the generic PSHUF instructions
-  // are preferable to some of the specialized forms despite requiring one more
-  // byte to encode because they can implicitly copy.
+  // For floating point shuffles, we don't have free copies in the shuffle
+  // instructions, so this always makes sense to canonicalize.
  //
-  // IF we *do* have VEX encodings, than we can use shorter, more specific
+  // For integer shuffles, if we don't have access to VEX encodings, the generic
+  // PSHUF instructions are preferable to some of the specialized forms despite
+  // requiring one more byte to encode because they can implicitly copy.
+  //
+  // IF we *do* have VEX encodings, then we can use shorter, more specific
  // shuffle instructions freely as they can copy due to the extra register
  // operand.
-  if (Subtarget->hasAVX()) {
+  if (FloatDomain || Subtarget->hasAVX()) {
    // We have both floating point and integer variants of shuffles that dup
    // either the low or high half of the vector.
    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
      bool Lo = Mask.equals(0, 0);
-      unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
-                                     : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
+      unsigned Shuffle;
+      // If the input is a floating point, check if we have SSE3 which will let
+      // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
+      // option to fold the input operand into even an unaligned memory load.
+      if (FloatDomain && Lo && Subtarget->hasSSE3()) {
+        Shuffle = X86ISD::MOVDDUP;
+      } else {
+        // We model everything else using UNPCK instructions. While MOVLHPS and
+        // MOVHLPS are shorter encodings they cannot accept a memory operand
+        // which overly constrains subsequent lowering.
+        Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+      }
      if (Depth == 1 && Root->getOpcode() == Shuffle)
        return false; // Nothing to do!
-      MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
+      MVT ShuffleVT = FloatDomain ? MVT::v2f64 : MVT::v2i64;
      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
      DCI.AddToWorklist(Op.getNode());
-      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+      if (Shuffle == X86ISD::MOVDDUP)
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+      else
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
      DCI.AddToWorklist(Op.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                    /*AddTo*/ true);
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE3

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
@ -48,7 +49,7 @@ define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {

 define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_00
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2:         unpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; CHECK-SSE2-NEXT:    retq
  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuffle
@ -62,17 +63,15 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
 }
 define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_11
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1,1]
+; CHECK-SSE2:         unpckhpd {{.*}} # xmm0 = xmm0[1,1]
 ; CHECK-SSE2-NEXT:    retq
  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
-; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
-;        of a mov?
-;
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_22
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
+; CHECK-SSE2:         unpcklpd {{.*}} # xmm1 = xmm1[0,0]
+; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
  ret <2 x double> %shuffle
@ -86,7 +85,8 @@ define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
 }
 define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
 ; CHECK-SSE2-LABEL: @shuffle_v2f64_33
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
+; CHECK-SSE2:         unpckhpd {{.*}} # xmm1 = xmm1[1,1]
+; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
 ; CHECK-SSE2-NEXT:    retq
  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
  ret <2 x double> %shuffle
@ -217,3 +217,31 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
  ret <2 x i64> %shuffle
 }
+
+
+define <2 x double> @insert_dup_reg_v2f64(double %a) {
+; CHECK-SSE2-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE2:         unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-SSE3-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE3:         unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE3-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
+; CHECK-SSE2-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE2:         movsd {{.*}}, %xmm0
+; CHECK-SSE2-NEXT:    unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-SSE3-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE3:         movddup {{.*}}, %xmm0
+; CHECK-SSE3-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@ -98,7 +98,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
 define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: @shuffle_v4f64_0001
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm1 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@ -109,7 +109,7 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@ -120,7 +120,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@ -130,7 +130,7 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: @shuffle_v4f64_1000
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@ -140,8 +140,8 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: @shuffle_v4f64_2200
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@ -152,7 +152,7 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
-; AVX1-NEXT:    vmovhlps {{.*}} # xmm1 = xmm1[1,1]
+; AVX1-NEXT:    vunpckhpd {{.*}} # xmm1 = xmm1[1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>