1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 20:23:11 +01:00
llvm-mirror/test/CodeGen/X86/avx-splat.ll
Simon Pilgrim 6f240f4b49 [X86][SSE] Improve i16 splatting shuffles
Better handling of the annoying pshuflw/pshufhw ops which only shuffle lower/upper halves of a vector.

Added vXi16 unary shuffle support for cases where i16 elements (from the same half of the source) are being splatted to the whole of one of the halves. This avoids the general lowering case which must shuffle the 32-bit elements first - meaning that we used to end up with unnecessary duplicate pshuflw/pshufhw shuffles.

Note this has the side effect of a lot of SSSE3 test cases no longer needing to use PSHUFB, as it falls below the 3 op combine threshold for when PSHUFB is typically worth it. I've raised PR26183 to discuss if the threshold should be changed and whether we need to make it more specific to the target CPU.

Differential Revision: http://reviews.llvm.org/D14901

llvm-svn: 258440
2016-01-21 22:07:41 +00:00

176 lines
7.0 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcA:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <32 x i8> %shuffle
}
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcB:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i16> %shuffle
}
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcC:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vmovq %rdi, %xmm0
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
ret <4 x i64> %vecinit6.i
}
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcD:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
ret <4 x double> %vecinit6.i
}
; Test this turns into a broadcast:
; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
;
define <8 x float> @funcE() nounwind {
; CHECK-LABEL: funcE:
; CHECK: ## BB#0: ## %for_exit499
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ## implicit-def: %YMM0
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne LBB4_2
; CHECK-NEXT: ## BB#1: ## %load.i1247
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-32, %rsp
; CHECK-NEXT: subq $1312, %rsp ## imm = 0x520
; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %ymm0
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: LBB4_2: ## %__load_and_broadcast_32.exit1249
; CHECK-NEXT: retq
allocas:
%udx495 = alloca [18 x [18 x float]], align 32
br label %for_test505.preheader
for_test505.preheader: ; preds = %for_test505.preheader, %allocas
br i1 undef, label %for_exit499, label %for_test505.preheader
for_exit499: ; preds = %for_test505.preheader
br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247
load.i1247: ; preds = %for_exit499
%ptr1227 = getelementptr [18 x [18 x float]], [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
%ptr.i1237 = bitcast float* %ptr1227 to i32*
%val.i1238 = load i32, i32* %ptr.i1237, align 4
%ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
%ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
%phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
br label %__load_and_broadcast_32.exit1249
__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499
%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
ret <8 x float> %load_broadcast12281250
}
define <8 x float> @funcF(i32 %val) nounwind {
; CHECK-LABEL: funcF:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovd %edi, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
%tmp = bitcast <8 x i32> %ret7 to <8 x float>
ret <8 x float> %tmp
}
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcG:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x float> %shuffle
}
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcH:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x float> %shuffle
}
define <2 x double> @splat_load_2f64_11(<2 x double>* %ptr) {
; CHECK-LABEL: splat_load_2f64_11:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: retq
%x = load <2 x double>, <2 x double>* %ptr
%x1 = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 1>
ret <2 x double> %x1
}
define <4 x double> @splat_load_4f64_2222(<4 x double>* %ptr) {
; CHECK-LABEL: splat_load_4f64_2222:
; CHECK: ## BB#0:
; CHECK-NEXT: vbroadcastsd 16(%rdi), %ymm0
; CHECK-NEXT: retq
%x = load <4 x double>, <4 x double>* %ptr
%x1 = shufflevector <4 x double> %x, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x double> %x1
}
define <4 x float> @splat_load_4f32_0000(<4 x float>* %ptr) {
; CHECK-LABEL: splat_load_4f32_0000:
; CHECK: ## BB#0:
; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
; CHECK-NEXT: retq
%x = load <4 x float>, <4 x float>* %ptr
%x1 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
ret <4 x float> %x1
}
define <8 x float> @splat_load_8f32_77777777(<8 x float>* %ptr) {
; CHECK-LABEL: splat_load_8f32_77777777:
; CHECK: ## BB#0:
; CHECK-NEXT: vbroadcastss 28(%rdi), %ymm0
; CHECK-NEXT: retq
%x = load <8 x float>, <8 x float>* %ptr
%x1 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <8 x float> %x1
}