mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 19:23:23 +01:00
[x86] Tweak the rules surrounding 0,0 and 1,1 v2f64 shuffles and add
support for MOVDDUP, which is really important for matrix-multiply-style operations that do lots of non-vector-aligned loads and splats. The original motivation was to add support for MOVDDUP, as the lack of it regresses matmul_f64_4x4 by 5% or so. However, all of the rules here were somewhat suspicious. First, we should always be using the floating point domain shuffles, regardless of how many copies we have to make, as a movapd is *crazy* faster than the domain switching cost on some chips. (Mostly because movapd is crazy cheap.) Because SHUFPD can't do the copy-for-free trick of the PSHUF instructions, there is no need to avoid canonicalizing on UNPCK variants, so do that canonicalizing. This also ensures we have the chance to form MOVDDUP. =] Second, we assume SSE2 support when doing any vector lowering, and given that, we should just use UNPCKLPD and UNPCKHPD, as they can operate on registers or memory. If vectors get spilled or come from memory at all, this is going to allow the load to be folded into the operation. If we want to optimize for encoding size (the only difference, and only a 2 byte difference), it should be done *much* later, likely after RA. llvm-svn: 217332
This commit is contained in:
parent
421418c484
commit
75df70c921
@ -19320,26 +19320,42 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
|
||||
// Use the float domain if the operand type is a floating point type.
|
||||
bool FloatDomain = VT.isFloatingPoint();
|
||||
|
||||
// If we don't have access to VEX encodings, the generic PSHUF instructions
|
||||
// are preferable to some of the specialized forms despite requiring one more
|
||||
// byte to encode because they can implicitly copy.
|
||||
// For floating point shuffles, we don't have free copies in the shuffle
|
||||
// instructions, so this always makes sense to canonicalize.
|
||||
//
|
||||
// IF we *do* have VEX encodings, than we can use shorter, more specific
|
||||
// For integer shuffles, if we don't have access to VEX encodings, the generic
|
||||
// PSHUF instructions are preferable to some of the specialized forms despite
|
||||
// requiring one more byte to encode because they can implicitly copy.
|
||||
//
|
||||
// IF we *do* have VEX encodings, then we can use shorter, more specific
|
||||
// shuffle instructions freely as they can copy due to the extra register
|
||||
// operand.
|
||||
if (Subtarget->hasAVX()) {
|
||||
if (FloatDomain || Subtarget->hasAVX()) {
|
||||
// We have both floating point and integer variants of shuffles that dup
|
||||
// either the low or high half of the vector.
|
||||
if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
|
||||
bool Lo = Mask.equals(0, 0);
|
||||
unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
|
||||
: (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
|
||||
unsigned Shuffle;
|
||||
// If the input is a floating point, check if we have SSE3 which will let
|
||||
// us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
|
||||
// option to fold the input operand into even an unaligned memory load.
|
||||
if (FloatDomain && Lo && Subtarget->hasSSE3()) {
|
||||
Shuffle = X86ISD::MOVDDUP;
|
||||
} else {
|
||||
// We model everything else using UNPCK instructions. While MOVLHPS and
|
||||
// MOVHLPS are shorter encodings they cannot accept a memory operand
|
||||
// which overly constrains subsequent lowering.
|
||||
Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
|
||||
}
|
||||
if (Depth == 1 && Root->getOpcode() == Shuffle)
|
||||
return false; // Nothing to do!
|
||||
MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
|
||||
MVT ShuffleVT = FloatDomain ? MVT::v2f64 : MVT::v2i64;
|
||||
Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
|
||||
DCI.AddToWorklist(Op.getNode());
|
||||
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
|
||||
if (Shuffle == X86ISD::MOVDDUP)
|
||||
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
|
||||
else
|
||||
Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
|
||||
DCI.AddToWorklist(Op.getNode());
|
||||
DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
|
||||
/*AddTo*/ true);
|
||||
|
@ -1,4 +1,5 @@
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE3
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-unknown"
|
||||
@ -48,7 +49,7 @@ define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
|
||||
|
||||
define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
|
||||
; CHECK-SSE2-LABEL: @shuffle_v2f64_00
|
||||
; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0,0]
|
||||
; CHECK-SSE2: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
|
||||
; CHECK-SSE2-NEXT: retq
|
||||
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
|
||||
ret <2 x double> %shuffle
|
||||
@ -62,17 +63,15 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
|
||||
}
|
||||
define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
|
||||
; CHECK-SSE2-LABEL: @shuffle_v2f64_11
|
||||
; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,1]
|
||||
; CHECK-SSE2: unpckhpd {{.*}} # xmm0 = xmm0[1,1]
|
||||
; CHECK-SSE2-NEXT: retq
|
||||
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
|
||||
ret <2 x double> %shuffle
|
||||
}
|
||||
define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
|
||||
; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
|
||||
; of a mov?
|
||||
;
|
||||
; CHECK-SSE2-LABEL: @shuffle_v2f64_22
|
||||
; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
|
||||
; CHECK-SSE2: unpcklpd {{.*}} # xmm1 = xmm1[0,0]
|
||||
; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
|
||||
; CHECK-SSE2-NEXT: retq
|
||||
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
|
||||
ret <2 x double> %shuffle
|
||||
@ -86,7 +85,8 @@ define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
|
||||
}
|
||||
define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
|
||||
; CHECK-SSE2-LABEL: @shuffle_v2f64_33
|
||||
; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
|
||||
; CHECK-SSE2: unpckhpd {{.*}} # xmm1 = xmm1[1,1]
|
||||
; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
|
||||
; CHECK-SSE2-NEXT: retq
|
||||
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
|
||||
ret <2 x double> %shuffle
|
||||
@ -217,3 +217,31 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
|
||||
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
|
||||
ret <2 x i64> %shuffle
|
||||
}
|
||||
|
||||
|
||||
define <2 x double> @insert_dup_reg_v2f64(double %a) {
|
||||
; CHECK-SSE2-LABEL: @insert_dup_reg_v2f64
|
||||
; CHECK-SSE2: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
|
||||
; CHECK-SSE2-NEXT: retq
|
||||
;
|
||||
; CHECK-SSE3-LABEL: @insert_dup_reg_v2f64
|
||||
; CHECK-SSE3: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
|
||||
; CHECK-SSE3-NEXT: retq
|
||||
%v = insertelement <2 x double> undef, double %a, i32 0
|
||||
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
|
||||
ret <2 x double> %shuffle
|
||||
}
|
||||
define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
|
||||
; CHECK-SSE2-LABEL: @insert_dup_mem_v2f64
|
||||
; CHECK-SSE2: movsd {{.*}}, %xmm0
|
||||
; CHECK-SSE2-NEXT: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
|
||||
; CHECK-SSE2-NEXT: retq
|
||||
;
|
||||
; CHECK-SSE3-LABEL: @insert_dup_mem_v2f64
|
||||
; CHECK-SSE3: movddup {{.*}}, %xmm0
|
||||
; CHECK-SSE3-NEXT: retq
|
||||
%a = load double* %ptr
|
||||
%v = insertelement <2 x double> undef, double %a, i32 0
|
||||
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
|
||||
ret <2 x double> %shuffle
|
||||
}
|
||||
|
@ -98,7 +98,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
|
||||
define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: @shuffle_v4f64_0001
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm0[0,0]
|
||||
; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm0[0,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
|
||||
@ -109,7 +109,7 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
|
||||
; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
|
||||
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
|
||||
@ -120,7 +120,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
|
||||
; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
|
||||
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
|
||||
@ -130,7 +130,7 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: @shuffle_v4f64_1000
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[1,0]
|
||||
; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
|
||||
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
|
||||
@ -140,8 +140,8 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1-LABEL: @shuffle_v4f64_2200
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm1[0,0]
|
||||
; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
|
||||
; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0]
|
||||
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
|
||||
@ -152,7 +152,7 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
|
||||
; AVX1: # BB#0:
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
|
||||
; AVX1-NEXT: vmovhlps {{.*}} # xmm1 = xmm1[1,1]
|
||||
; AVX1-NEXT: vunpckhpd {{.*}} # xmm1 = xmm1[1,1]
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
|
||||
|
Loading…
Reference in New Issue
Block a user