llvm-mirror/test/CodeGen/X86/avx-splat.ll

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s


; funcA: splat byte element 5 of a <32 x i8> vector into all 32 lanes
; (every index of the shuffle mask is 5).
; The label check below anchors the following patterns to this function's
; own assembly, so they cannot be satisfied by code emitted for another
; function in this file.
; Expected sequence, on consecutive lines: two byte unpacks on xmm registers,
; a vpshufd with immediate 85, then a vinsertf128 with immediate 1.
; CHECK-LABEL: funcA:
; CHECK: vpunpcklbw %xmm
; CHECK-NEXT: vpunpckhbw %xmm
; CHECK-NEXT: vpshufd $85
; CHECK-NEXT: vinsertf128 $1
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <32 x i8> %shuffle
}

; funcB: splat 16-bit element 5 of a <16 x i16> vector into all 16 lanes
; (every index of the shuffle mask is 5).
; The label check anchors the patterns to this function's own assembly so
; they cannot match instructions belonging to a neighbouring function.
; Expected sequence, on consecutive lines: a high-word unpack on xmm
; registers, a vpshufd with immediate 85, then a vinsertf128 with immediate 1.
; CHECK-LABEL: funcB:
; CHECK: vpunpckhwd %xmm
; CHECK-NEXT: vpshufd $85
; CHECK-NEXT: vinsertf128 $1
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i16> %shuffle
}

; funcC: build a <4 x i64> whose four lanes all hold the scalar argument %q,
; using a chain of insertelement instructions (lanes 0 through 3).
; The label check anchors the patterns to this function's own assembly so
; they cannot match instructions belonging to a neighbouring function.
; Expected sequence, on consecutive lines: vmovq, vmovlhps on xmm registers,
; then a vinsertf128 with immediate 1.
; CHECK-LABEL: funcC:
; CHECK: vmovq
; CHECK-NEXT: vmovlhps %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
entry:
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

; funcD: build a <4 x double> whose four lanes all hold the scalar argument
; %q, using a chain of insertelement instructions (lanes 0 through 3).
; The label check anchors the patterns to this function's own assembly so
; they cannot match instructions belonging to a neighbouring function.
; Expected sequence, on consecutive lines: a vpermilpd with immediate 0, then
; a vinsertf128 with immediate 1.
; CHECK-LABEL: funcD:
; CHECK: vpermilpd $0
; CHECK-NEXT: vinsertf128 $1
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
entry:
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

; Test this simple opt:
;   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
; To:
;   shuffle (vload ptr), undef, <1, 1, 1, 1>
;
; funcE loads one i32 from element [1][1] of a 32-byte aligned
; [18 x [18 x float]] stack array and inserts it into lanes 6 and 7 of an
; otherwise undef <8 x i32>, which is then bitcast to <8 x float>.
; That element sits at byte offset (1*18 + 1) * 4 = 76, i.e. it is element 3
; of the aligned 16-byte chunk starting at offset 64; this is consistent with
; the expected aligned vector load followed by vpshufd $-1 (0xFF selects
; element 3 for every lane).
; The label check anchors the patterns to this function's own assembly so
; they cannot match instructions belonging to a neighbouring function.
; CHECK-LABEL: funcE:
; CHECK: vmovdqa
; CHECK-NEXT: vpshufd $-1
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcE() nounwind {
allocas:
  %udx495 = alloca [18 x [18 x float]], align 32
  br label %for_test505.preheader

for_test505.preheader:                            ; preds = %for_test505.preheader, %allocas
  br i1 undef, label %for_exit499, label %for_test505.preheader

for_exit499:                                      ; preds = %for_test505.preheader
  br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247

load.i1247:                                       ; preds = %for_exit499
  %ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
  %ptr.i1237 = bitcast float* %ptr1227 to i32*
  %val.i1238 = load i32* %ptr.i1237, align 4
  %ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
  %ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
  %phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
  br label %__load_and_broadcast_32.exit1249

__load_and_broadcast_32.exit1249:                 ; preds = %load.i1247, %for_exit499
  %load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
  ret <8 x float> %load_broadcast12281250
}

; funcF: insert the scalar argument %val into lanes 6 and 7 of an otherwise
; undef <8 x i32>, then bitcast the result to <8 x float>. Only two lanes are
; defined, and both hold the same value.
; The label check anchors the patterns to this function's own assembly so
; they cannot match instructions belonging to a neighbouring function.
; Expected sequence, on consecutive lines: a vpshufd with immediate 0, then a
; vinsertf128 with immediate 1.
; CHECK-LABEL: funcF:
; CHECK: vpshufd $0
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcF(i32 %val) nounwind {
  %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
  %tmp = bitcast <8 x i32> %ret7 to <8 x float>
  ret <8 x float> %tmp
}

; funcG: splat element 0 of a <8 x float> vector into all 8 lanes
; (every index of the shuffle mask is 0).
; The label check anchors the patterns to this function's own assembly so
; they cannot match instructions belonging to a neighbouring function.
; Expected sequence, on consecutive lines: a vpshufd with immediate 0, then a
; vinsertf128 with immediate 1.
; CHECK-LABEL: funcG:
; CHECK: vpshufd $0
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x float> %shuffle
}

; funcH: splat element 5 of a <8 x float> vector into all 8 lanes
; (every index of the shuffle mask is 5). Element 5 is in the upper four
; elements of the source, which is why the expected code starts with a
; vextractf128 with immediate 1 before shuffling.
; The label check anchors the patterns to this function's own assembly so
; they cannot match instructions belonging to a neighbouring function.
; Expected sequence, on consecutive lines: vextractf128 $1, a vpshufd (its
; immediate is not checked), then a vinsertf128 with immediate 1.
; CHECK-LABEL: funcH:
; CHECK: vextractf128 $1
; CHECK-NEXT: vpshufd
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %shuffle
}
Add support for 256-bit versions of VPERMIL instruction. This is a new instruction introduced in AVX, which can operate on 128 and 256-bit vectors. It considers a 256-bit vector as two independent 128-bit lanes. It can permute any 32 or 64 elements inside a lane, and restricts the second lane to have the same permutation of the first one. With the improved splat support introduced early today, adding codegen for this instruction enable more efficient 256-bit code: Instead of: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vextractf128 $1, %ymm0, %xmm1 shufps $1, %xmm1, %xmm1 movss %xmm1, 28(%rsp) movss %xmm1, 24(%rsp) movss %xmm1, 20(%rsp) movss %xmm1, 16(%rsp) vextractf128 $0, %ymm0, %xmm0 shufps $1, %xmm0, %xmm0 movss %xmm0, 12(%rsp) movss %xmm0, 8(%rsp) movss %xmm0, 4(%rsp) movss %xmm0, (%rsp) vmovaps (%rsp), %ymm0 We get: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vpermilps $85, %ymm0, %ymm0 llvm-svn: 135662 2011-07-21 03:55:47 +02:00			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx \| FileCheck %s`


Add support for breaking 256-bit v16i16 and v32i8 VSETCC into two 128-bit ones, avoiding scalarization. Add vex form of pcmpeqq and pcmpgtq. Fixes more cases for PR10712. llvm-svn: 138321 2011-08-23 06:36:33 +02:00			`; CHECK: vpunpcklbw %xmm`
			`; CHECK-NEXT: vpunpckhbw %xmm`
X86: Do splat promotion later, so the optimizer can chew on it first. This catches many cases where we can emit a more efficient shuffle for a specific mask or when the mask contains undefs. Once the splat is lowered to unpacks we can't do that anymore. There is a possibility of moving the promotion after pshufb matching, but I'm not sure if pshufb with a mask loaded from memory is faster than 3 shuffles, so I avoided that for now. llvm-svn: 173569 2013-01-26 12:44:21 +01:00			`; CHECK-NEXT: vpshufd $85`
Add support for 256-bit versions of VPERMIL instruction. This is a new instruction introduced in AVX, which can operate on 128 and 256-bit vectors. It considers a 256-bit vector as two independent 128-bit lanes. It can permute any 32 or 64 elements inside a lane, and restricts the second lane to have the same permutation of the first one. With the improved splat support introduced early today, adding codegen for this instruction enable more efficient 256-bit code: Instead of: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vextractf128 $1, %ymm0, %xmm1 shufps $1, %xmm1, %xmm1 movss %xmm1, 28(%rsp) movss %xmm1, 24(%rsp) movss %xmm1, 20(%rsp) movss %xmm1, 16(%rsp) vextractf128 $0, %ymm0, %xmm0 shufps $1, %xmm0, %xmm0 movss %xmm0, 12(%rsp) movss %xmm0, 8(%rsp) movss %xmm0, 4(%rsp) movss %xmm0, (%rsp) vmovaps (%rsp), %ymm0 We get: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vpermilps $85, %ymm0, %ymm0 llvm-svn: 135662 2011-07-21 03:55:47 +02:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <32 x i8> %shuffle`
			`}`

Add support for breaking 256-bit v16i16 and v32i8 VSETCC into two 128-bit ones, avoiding scalarization. Add vex form of pcmpeqq and pcmpgtq. Fixes more cases for PR10712. llvm-svn: 138321 2011-08-23 06:36:33 +02:00			`; CHECK: vpunpckhwd %xmm`
X86: Do splat promotion later, so the optimizer can chew on it first. This catches many cases where we can emit a more efficient shuffle for a specific mask or when the mask contains undefs. Once the splat is lowered to unpacks we can't do that anymore. There is a possibility of moving the promotion after pshufb matching, but I'm not sure if pshufb with a mask loaded from memory is faster than 3 shuffles, so I avoided that for now. llvm-svn: 173569 2013-01-26 12:44:21 +01:00			`; CHECK-NEXT: vpshufd $85`
- Register v16i16 as valid VR256 register class - Add more bitcasts for v16i16 - Since 135661 and 135662 already added the splat logic, just add one more splat test for v16i16 llvm-svn: 135663 2011-07-21 04:24:08 +02:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <16 x i16> %shuffle`
			`}`

Remove some instructions that existed to provide aliases to the assembler. Can be done with InstAlias instead. Unfortunately, this was causing printer to use 'vmovq' or 'vmovd' based on what was parsed. To cleanup the inconsistencies convert all 'vmovd' with 64-bit registers to 'vmovq', but provide an alias so that 'vmovd' will still parse. llvm-svn: 192171 2013-10-08 07:53:50 +02:00			`; CHECK: vmovq`
Fix a nasty bug where a v4i64 was being wrongly emitted with 32-bit permutations. Also tidy up some patterns and make them close to their instruction definition! llvm-svn: 138392 2011-08-24 00:06:37 +02:00			`; CHECK-NEXT: vmovlhps %xmm`
- Handle special scalar_to_vector case: splats. Using a native 128-bit shuffle before inserting on a 256-bit vector. - Add AVX versions of movd/movq instructions - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoiding emitting one more instruction. llvm-svn: 136002 2011-07-26 01:05:25 +02:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {`
			`entry:`
			`%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0`
			`%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1`
			`%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2`
			`%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3`
			`ret <4 x i64> %vecinit6.i`
			`}`

Add instruction selection for 256-bit VPSHUFD and 128-bit VPERMILPS/VPERMILPD. llvm-svn: 149968 2012-02-07 07:28:42 +01:00			`; CHECK: vpermilpd $0`
Fix a nasty bug where a v4i64 was being wrongly emitted with 32-bit permutations. Also tidy up some patterns and make them close to their instruction definition! llvm-svn: 138392 2011-08-24 00:06:37 +02:00			`; CHECK-NEXT: vinsertf128 $1`
- Handle special scalar_to_vector case: splats. Using a native 128-bit shuffle before inserting on a 256-bit vector. - Add AVX versions of movd/movq instructions - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoiding emitting one more instruction. llvm-svn: 136002 2011-07-26 01:05:25 +02:00			`define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {`
			`entry:`
			`%vecinit.i = insertelement <4 x double> undef, double %q, i32 0`
			`%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1`
			`%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2`
			`%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3`
			`ret <4 x double> %vecinit6.i`
			`}`
Make this kind of lowering supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-02 18:06:18 +02:00
			`; Test this simple opt:`
			`; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>`
			`; To:`
			`; shuffle (vload ptr)), undef, <1, 1, 1, 1>`
Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-26 20:24:31 +01:00			`; CHECK: vmovdqa`
			`; CHECK-NEXT: vpshufd $-1`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 04:49:44 +02:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <8 x float> @funcE() nounwind {`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-02 18:06:18 +02:00			`allocas:`
			`%udx495 = alloca [18 x [18 x float]], align 32`
			`br label %for_test505.preheader`

			`for_test505.preheader: ; preds = %for_test505.preheader, %allocas`
			`br i1 undef, label %for_exit499, label %for_test505.preheader`

			`for_exit499: ; preds = %for_test505.preheader`
			`br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247`

			`load.i1247: ; preds = %for_exit499`
			`%ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1`
			`%ptr.i1237 = bitcast float* %ptr1227 to i32*`
			`%val.i1238 = load i32* %ptr.i1237, align 4`
			`%ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6`
			`%ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7`
			`%phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>`
			`br label %__load_and_broadcast_32.exit1249`

			`__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499`
			`%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 04:49:44 +02:00			`ret <8 x float> %load_broadcast12281250`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-02 18:06:18 +02:00			`}`

Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-26 20:24:31 +01:00			`; CHECK: vpshufd $0`
			`; CHECK-NEXT: vinsertf128 $1`
Update test to not use the scalar type to splat from a load llvm-svn: 137809 2011-08-17 04:29:15 +02:00			`define <8 x float> @funcF(i32 %val) nounwind {`
Use the splat index to generate the desired shuffle. Otherwise we could only get undefs and the vector shuffle becomes an undef, generating wrong code. llvm-svn: 137295 2011-08-11 04:49:41 +02:00			`%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6`
			`%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7`
			`%tmp = bitcast <8 x i32> %ret7 to <8 x float>`
			`ret <8 x float> %tmp`
			`}`

X86: Prefer using VPSHUFD over VPERMIL because it has better throughput. llvm-svn: 169624 2012-12-07 20:01:13 +01:00			`; CHECK: vpshufd $0`
Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-26 20:24:31 +01:00			`; CHECK-NEXT: vinsertf128 $1`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 04:49:44 +02:00			`define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>`
			`ret <8 x float> %shuffle`
			`}`

			`; CHECK: vextractf128 $1`
X86: Prefer using VPSHUFD over VPERMIL because it has better throughput. llvm-svn: 169624 2012-12-07 20:01:13 +01:00			`; CHECK-NEXT: vpshufd`
Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-26 20:24:31 +01:00			`; CHECK-NEXT: vinsertf128 $1`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 04:49:44 +02:00			`define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <8 x float> %shuffle`
			`}`