1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 19:23:23 +01:00

[x86] Tweak the rules surrounding 0,0 and 1,1 v2f64 shuffles and add

support for MOVDDUP, which is really important for matrix-multiply-style
operations that do lots of non-vector-aligned loads and splats.

The original motivation was to add support for MOVDDUP as the lack of it
regresses matmul_f64_4x4 by 5% or so. However, all of the rules here
were somewhat suspicious.

First, we should always be using the floating point domain shuffles,
regardless of how many copies we have to make, as a movapd is *far*
cheaper than the domain-switching cost on some chips. (Mostly because
movapd is crazy cheap.) Because SHUFPD can't do the copy-for-free trick
of the PSHUF instructions, there is no need to avoid canonicalizing on
UNPCK variants, so do that canonicalizing. This also ensures we have the
chance to form MOVDDUP. =]

Second, we assume SSE2 support when doing any vector lowering, and, given
that, we should just use UNPCKLPD and UNPCKHPD, as they can operate on
registers or memory. If vectors get spilled or come from memory at all,
this is going to allow the load to be folded into the operation. If we
want to optimize for encoding size (the only difference, and only
a 2-byte difference), it should be done *much* later, likely after
register allocation (RA).

llvm-svn: 217332
This commit is contained in:
Chandler Carruth 2014-09-07 12:02:14 +00:00
parent 421418c484
commit 75df70c921
3 changed files with 67 additions and 23 deletions

View File

@ -19320,26 +19320,42 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// Use the float domain if the operand type is a floating point type.
bool FloatDomain = VT.isFloatingPoint();
// If we don't have access to VEX encodings, the generic PSHUF instructions
// are preferable to some of the specialized forms despite requiring one more
// byte to encode because they can implicitly copy.
// For floating point shuffles, we don't have free copies in the shuffle
// instructions, so this always makes sense to canonicalize.
//
// IF we *do* have VEX encodings, than we can use shorter, more specific
// For integer shuffles, if we don't have access to VEX encodings, the generic
// PSHUF instructions are preferable to some of the specialized forms despite
// requiring one more byte to encode because they can implicitly copy.
//
// IF we *do* have VEX encodings, then we can use shorter, more specific
// shuffle instructions freely as they can copy due to the extra register
// operand.
if (Subtarget->hasAVX()) {
if (FloatDomain || Subtarget->hasAVX()) {
// We have both floating point and integer variants of shuffles that dup
// either the low or high half of the vector.
if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
bool Lo = Mask.equals(0, 0);
unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
: (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
unsigned Shuffle;
// If the input is a floating point, check if we have SSE3 which will let
// us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
// option to fold the input operand into even an unaligned memory load.
if (FloatDomain && Lo && Subtarget->hasSSE3()) {
Shuffle = X86ISD::MOVDDUP;
} else {
// We model everything else using UNPCK instructions. While MOVLHPS and
// MOVHLPS are shorter encodings they cannot accept a memory operand
// which overly constrains subsequent lowering.
Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
}
if (Depth == 1 && Root->getOpcode() == Shuffle)
return false; // Nothing to do!
MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
MVT ShuffleVT = FloatDomain ? MVT::v2f64 : MVT::v2i64;
Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
DCI.AddToWorklist(Op.getNode());
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
if (Shuffle == X86ISD::MOVDDUP)
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
else
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
DCI.AddToWorklist(Op.getNode());
DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
/*AddTo*/ true);

View File

@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE3
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
@ -48,7 +49,7 @@ define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
; CHECK-SSE2-LABEL: @shuffle_v2f64_00
; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0,0]
; CHECK-SSE2: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
ret <2 x double> %shuffle
@ -62,17 +63,15 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
; CHECK-SSE2-LABEL: @shuffle_v2f64_11
; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,1]
; CHECK-SSE2: unpckhpd {{.*}} # xmm0 = xmm0[1,1]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
; of a mov?
;
; CHECK-SSE2-LABEL: @shuffle_v2f64_22
; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
; CHECK-SSE2: unpcklpd {{.*}} # xmm1 = xmm1[0,0]
; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
ret <2 x double> %shuffle
@ -86,7 +85,8 @@ define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
; CHECK-SSE2-LABEL: @shuffle_v2f64_33
; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
; CHECK-SSE2: unpckhpd {{.*}} # xmm1 = xmm1[1,1]
; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
ret <2 x double> %shuffle
@ -217,3 +217,31 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
ret <2 x i64> %shuffle
}
; Splat test: places the scalar argument %a in lane 0 of an otherwise-undef
; <2 x double> and broadcasts it to both lanes with a <0, 0> shuffle mask.
define <2 x double> @insert_dup_reg_v2f64(double %a) {
; Expected lowering for the SSE2 run: one unpcklpd giving xmm0 = xmm0[0,0].
; CHECK-SSE2-LABEL: @insert_dup_reg_v2f64
; CHECK-SSE2: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
; CHECK-SSE2-NEXT: retq
;
; The SSE3 run is checked for the same unpcklpd (not movddup) in this
; register-operand case.
; CHECK-SSE3-LABEL: @insert_dup_reg_v2f64
; CHECK-SSE3: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
; CHECK-SSE3-NEXT: retq
%v = insertelement <2 x double> undef, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
ret <2 x double> %shuffle
}
; Splat test: loads a double from %ptr, places it in lane 0 of an
; otherwise-undef <2 x double>, and broadcasts it with a <0, 0> shuffle mask.
define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
; Expected lowering for the SSE2 run: a movsd into xmm0 followed by an
; unpcklpd giving xmm0 = xmm0[0,0].
; CHECK-SSE2-LABEL: @insert_dup_mem_v2f64
; CHECK-SSE2: movsd {{.*}}, %xmm0
; CHECK-SSE2-NEXT: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
; CHECK-SSE2-NEXT: retq
;
; Expected lowering for the SSE3 run: a single movddup writing xmm0 directly
; before the return (presumably with the load folded into its source operand).
; CHECK-SSE3-LABEL: @insert_dup_mem_v2f64
; CHECK-SSE3: movddup {{.*}}, %xmm0
; CHECK-SSE3-NEXT: retq
%a = load double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
ret <2 x double> %shuffle
}

View File

@ -98,7 +98,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: @shuffle_v4f64_0001
; AVX1: # BB#0:
; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm0[0,0]
; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@ -109,7 +109,7 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@ -120,7 +120,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@ -130,7 +130,7 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: @shuffle_v4f64_1000
; AVX1: # BB#0:
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[1,0]
; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@ -140,8 +140,8 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: @shuffle_v4f64_2200
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm1[0,0]
; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0]
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@ -152,7 +152,7 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
; AVX1-NEXT: vmovhlps {{.*}} # xmm1 = xmm1[1,1]
; AVX1-NEXT: vunpckhpd {{.*}} # xmm1 = xmm1[1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>