llvm-mirror/test/CodeGen/X86/avx-basic.ll

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s

; Zero-initialized global vectors used as store destinations by the tests in
; this file: @x and @y are 256-bit wide (32-byte aligned), @z is 128-bit wide
; (16-byte aligned).
@x = common global <8 x float> zeroinitializer, align 32
@y = common global <4 x double> zeroinitializer, align 32
@z = common global <4 x float> zeroinitializer, align 16

; Stores an all-zero <4 x float> to @z with 16-byte alignment.
; The expected output is a vpxor (to materialize the zero vector) followed by
; a vmovaps (the aligned store).
define void @zero128() nounwind ssp {
entry:
  ; CHECK: vpxor
  ; CHECK: vmovaps
  store <4 x float> zeroinitializer, <4 x float>* @z, align 16
  ret void
}

; Stores an all-zero <8 x float> to @x and an all-zero <4 x double> to @y,
; both with 32-byte alignment.
; The expected output is one vxorps followed by two vmovaps, one per store.
define void @zero256() nounwind ssp {
entry:
  ; CHECK: vxorps
  ; CHECK: vmovaps
  ; CHECK: vmovaps
  store <8 x float> zeroinitializer, <8 x float>* @x, align 32
  store <4 x double> zeroinitializer, <4 x double>* @y, align 32
  ret void
}

; Stores an <8 x float> splat of the constant 0xFFFFFFFFE0000000 through %RET
; (bitcast to <8 x float>*), 32-byte aligned. %aFOO is not used.
; NOTE(review): going by the test name, the constant is presumably the
; all-ones bit pattern -- confirm.
; The expected output is a vpcmpeqd followed by a vinsertf128 $1.
; CHECK: vpcmpeqd
; CHECK: vinsertf128 $1
define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {
allocas:
  %ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
  store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>, <8 x
float>* %ptr2vec615, align 32
  ret void
}

; Integer counterpart of @ones: stores an <8 x i32> vector whose eight lanes
; are all -1 through %RET (bitcast to <8 x i32>*), 32-byte aligned.
; %aFOO is not used.
; The expected output is a vpcmpeqd followed by a vinsertf128 $1.
; CHECK: vpcmpeqd
; CHECK: vinsertf128 $1
define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwind {
allocas:
  %ptr2vec615 = bitcast [0 x i32]* %RET to <8 x i32>*
  store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %ptr2vec615, align 32
  ret void
}

;;; Just make sure this doesn't crash
;;; Shuffles %a against an undef second operand with mask <2, 3, 4, 4>.
;;; Only the presence of the function's symbol in the output is verified; no
;;; particular instruction sequence is required.
; CHECK: _ISelCrash
define <4 x i64> @ISelCrash(<4 x i64> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>
  ret <4 x i64> %shuffle
}

;;;
;;; Check that some 256-bit vector shuffles are transformed into 128-bit ops.
;;; @A shuffles %a and %b with mask <1, 0, 7, 6>. After the function's symbol,
;;; the expected output is four instructions on consecutive lines:
;;; vshufpd $1, vextractf128 $1, vshufpd $1, vinsertf128 $1.
; CHECK: _A
; CHECK: vshufpd $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vshufpd $1
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @A(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
  ret <4 x i64> %shuffle
}

;;; @B shuffles %a and %b with mask <1, undef, undef, 6>. After the function's
;;; symbol, the expected output contains a vshufpd $1 whose first register
;;; operand is a ymm register, i.e. the 256-bit form of the instruction.
; CHECK: _B
; CHECK: vshufpd $1, %ymm
define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 undef, i32 undef, i32 6>
  ret <4 x i64> %shuffle
}

;;; @C shuffles %a and %b with mask <undef, 0, undef, 6>: lane 1 takes element
;;; 0 of %a, lane 3 takes element 6 of the concatenated inputs (element 2 of
;;; %b), and lanes 0 and 2 are left undefined.
;;; The symbol anchor below ties the instruction checks to this function, in
;;; the same way as the anchors used for @A and @B. After it, the expected
;;; output is four instructions on consecutive lines:
;;; movlhps, vextractf128 $1, movlhps, vinsertf128 $1.
; CHECK: _C
; CHECK: movlhps
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: movlhps
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 undef, i32 0, i32 undef, i32 6>
  ret <4 x i64> %shuffle
}

;;; @D shuffles %a and %b with mask <0, 0, 2, 2, 10, 10, 11, 11>: the low four
;;; lanes are elements 0, 0, 2, 2 of %a and the high four lanes are elements
;;; 2, 2, 3, 3 of %b (indices 8 and up select from the second operand).
;;; The symbol anchor below ties the instruction checks to this function, in
;;; the same way as the anchors used for @A and @B. After it, the expected
;;; output is one vpshufd per 128-bit half (immediates -96 and -6, i.e. 0xA0
;;; and 0xFA) followed by a vinsertf128 $1 that recombines the halves.
; CHECK: _D
; CHECK: vpshufd $-96
; CHECK: vpshufd $-6
; CHECK: vinsertf128 $1
define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 10, i32 10, i32 11, i32 11>
  ret <8 x i32> %shuffle
}

;;; Don't crash on movd
;;; Loads an i32 from the start of %aFOO and inserts it into lane 1 of an
;;; otherwise undefined <8 x i32>, which is returned. A second i32 is loaded
;;; from element 1 of %aFOO but its value is never used.
;;; After the function's symbol, the expected output contains a vmovd whose
;;; source is a memory operand.
; CHECK: _VMOVZQI2PQI
; CHECK: vmovd (%
define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
allocas:
  ; Reinterpret the float array pointer as i32* and load element 0.
  %ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
  %val.i34.i = load i32* %ptrcast.i33.i, align 4
  ; Load element 1 the same way; the result is unused below.
  %ptroffset.i22.i992 = getelementptr [0 x float]* %aFOO, i64 0, i64 1
  %ptrcast.i23.i = bitcast float* %ptroffset.i22.i992 to i32*
  %val.i24.i = load i32* %ptrcast.i23.i, align 4
  ; Only the first loaded value reaches the returned vector.
  %updatedret.i30.i = insertelement <8 x i32> undef, i32 %val.i34.i, i32 1
  ret <8 x i32> %updatedret.i30.i
}
- Handle special scalar_to_vector case: splats. Using a native 128-bit shuffle before inserting on a 256-bit vector. - Add AVX versions of movd/movq instructions - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoiding emitting one more instruction. llvm-svn: 136002 2011-07-26 01:05:25 +02:00			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx \| FileCheck %s`
Begin to support some vector operations for AVX 256-bit instructions. The long term goal here is to be able to match enough of vector_shuffle and build_vector so all avx intrinsics which aren't mapped to their own built-ins but to shufflevector calls can be codegen'd. This is the first (baby) step, support building zeroed vectors. llvm-svn: 110897 2010-08-12 04:06:36 +02:00
			`@x = common global <8 x float> zeroinitializer, align 32`
			`@y = common global <4 x double> zeroinitializer, align 32`
Rename and tidy up tests llvm-svn: 137103 2011-08-09 05:04:23 +02:00			`@z = common global <4 x float> zeroinitializer, align 16`
Begin to support some vector operations for AVX 256-bit instructions. The long term goal here is to be able to match enough of vector_shuffle and build_vector so all avx intrinsics which aren't mapped to their own built-ins but to shufflevector calls can be codegen'd. This is the first (baby) step, support building zeroed vectors. llvm-svn: 110897 2010-08-12 04:06:36 +02:00
Rename and tidy up tests llvm-svn: 137103 2011-08-09 05:04:23 +02:00			`define void @zero128() nounwind ssp {`
			`entry:`
Change all checks regarding the presence of any SSE level to always take into consideration the presence of AVX. This change, together with the SSEDomainFix enabled for AVX, makes AVX codegen always (hopefully) emit the same code as SSE for 128-bit vector ops. I don't have a testcase for this, but AVX now beats SSE in performance for 128-bit ops in the majority of programs in the llvm testsuite llvm-svn: 139817 2011-09-15 20:27:36 +02:00			`; CHECK: vpxor`
Rename and tidy up tests llvm-svn: 137103 2011-08-09 05:04:23 +02:00			`; CHECK: vmovaps`
			`store <4 x float> zeroinitializer, <4 x float>* @z, align 16`
			`ret void`
			`}`

			`define void @zero256() nounwind ssp {`
Begin to support some vector operations for AVX 256-bit instructions. The long term goal here is to be able to match enough of vector_shuffle and build_vector so all avx intrinsics which aren't mapped to their own built-ins but to shufflevector calls can be codegen'd. This is the first (baby) step, support building zeroed vectors. llvm-svn: 110897 2010-08-12 04:06:36 +02:00			`entry:`
			`; CHECK: vxorps`
			`; CHECK: vmovaps`
			`; CHECK: vmovaps`
			`store <8 x float> zeroinitializer, <8 x float>* @x, align 32`
			`store <4 x double> zeroinitializer, <4 x double>* @y, align 32`
			`ret void`
			`}`
Codegen allonesvector better while using AVX: vpcmpeqd + vinsertf128 This also fixes PR10452 llvm-svn: 136004 2011-07-26 01:05:32 +02:00
			`; CHECK: vpcmpeqd`
			`; CHECK: vinsertf128 $1`
			`define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {`
			`allocas:`
			`%ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*`
			`store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float`
			`0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float`
			`0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>, <8 x`
			`float>* %ptr2vec615, align 32`
			`ret void`
			`}`
Since vectors with all ones can't be created with a 256-bit instruction, avoid returning early for v8i32 types, which would only be valid for vector with all zeros. Also split the handling of zeros and ones into separate checking logic since they are handled differently. This fixes PR10547 llvm-svn: 136642 2011-08-01 21:51:53 +02:00
			`; CHECK: vpcmpeqd`
			`; CHECK: vinsertf128 $1`
			`define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwind {`
			`allocas:`
			`%ptr2vec615 = bitcast [0 x i32]* %RET to <8 x i32>*`
			`store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %ptr2vec615, align 32`
			`ret void`
			`}`
Fix PR10492 by teaching MOVHLPS and MOVLPS mask matching to be more strict. llvm-svn: 137324 2011-08-11 20:59:13 +02:00
			`;;; Just make sure this doesn't crash`
			`; CHECK: _ISelCrash`
			`define <4 x i64> @ISelCrash(<4 x i64> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>`
			`ret <4 x i64> %shuffle`
			`}`
Instead of always leaving the work to the generic legalizer when there is no support for native 256-bit shuffles, be more smart in some cases, for example, when you can extract specific 128-bit parts and use regular 128-bit shuffles for them. Example: For this shuffle: shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6> This was expanded to: vextractf128 $1, %ymm1, %xmm2 vpextrq $0, %xmm2, %rax vmovd %rax, %xmm1 vpextrq $1, %xmm2, %rax vmovd %rax, %xmm2 vpunpcklqdq %xmm1, %xmm2, %xmm1 vpextrq $0, %xmm0, %rax vmovd %rax, %xmm2 vpextrq $1, %xmm0, %rax vmovd %rax, %xmm0 vpunpcklqdq %xmm2, %xmm0, %xmm0 vinsertf128 $1, %xmm1, %ymm0, %ymm0 ret Now we get: vshufpd $1, %xmm0, %xmm0, %xmm0 vextractf128 $1, %ymm1, %xmm1 vshufpd $1, %xmm1, %xmm1, %xmm1 vinsertf128 $1, %xmm1, %ymm0, %ymm0 llvm-svn: 137733 2011-08-16 20:21:54 +02:00
			`;;;`
			`;;; Check that some 256-bit vectors are xformed into 128 ops`
			`; CHECK: _A`
			`; CHECK: vshufpd $1`
			`; CHECK-NEXT: vextractf128 $1`
			`; CHECK-NEXT: vshufpd $1`
			`; CHECK-NEXT: vinsertf128 $1`
			`define <4 x i64> @A(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>`
			`ret <4 x i64> %shuffle`
			`}`

Add support for 256-bit versions of VSHUFPD and VSHUFPS. llvm-svn: 138546 2011-08-25 04:58:26 +02:00			`; CHECK: _B`
			`; CHECK: vshufpd $1, %ymm`
Instead of always leaving the work to the generic legalizer when there is no support for native 256-bit shuffles, be more smart in some cases, for example, when you can extract specific 128-bit parts and use regular 128-bit shuffles for them. Example: For this shuffle: shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6> This was expanded to: vextractf128 $1, %ymm1, %xmm2 vpextrq $0, %xmm2, %rax vmovd %rax, %xmm1 vpextrq $1, %xmm2, %rax vmovd %rax, %xmm2 vpunpcklqdq %xmm1, %xmm2, %xmm1 vpextrq $0, %xmm0, %rax vmovd %rax, %xmm2 vpextrq $1, %xmm0, %rax vmovd %rax, %xmm0 vpunpcklqdq %xmm2, %xmm0, %xmm0 vinsertf128 $1, %xmm1, %ymm0, %ymm0 ret Now we get: vshufpd $1, %xmm0, %xmm0, %xmm0 vextractf128 $1, %ymm1, %xmm1 vshufpd $1, %xmm1, %xmm1, %xmm1 vinsertf128 $1, %xmm1, %ymm0, %ymm0 llvm-svn: 137733 2011-08-16 20:21:54 +02:00			`define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 undef, i32 undef, i32 6>`
			`ret <4 x i64> %shuffle`
			`}`

			`; CHECK: movlhps`
			`; CHECK-NEXT: vextractf128 $1`
			`; CHECK-NEXT: movlhps`
			`; CHECK-NEXT: vinsertf128 $1`
			`define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 undef, i32 0, i32 undef, i32 6>`
			`ret <4 x i64> %shuffle`
			`}`

			`; CHECK: vpshufd $-96`
			`; CHECK: vpshufd $-6`
			`; CHECK: vinsertf128 $1`
			`define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 10, i32 10, i32 11, i32 11>`
			`ret <8 x i32> %shuffle`
			`}`

Fix PR10845. SUBREG_TO_REG shouldn't be used when the input and destination types are equal! llvm-svn: 139553 2011-09-13 00:59:23 +02:00			`;;; Don't crash on movd`
			`; CHECK: _VMOVZQI2PQI`
			`; CHECK: vmovd (%`
			`define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {`
			`allocas:`
			`%ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*`
			`%val.i34.i = load i32* %ptrcast.i33.i, align 4`
			`%ptroffset.i22.i992 = getelementptr [0 x float]* %aFOO, i64 0, i64 1`
			`%ptrcast.i23.i = bitcast float* %ptroffset.i22.i992 to i32*`
			`%val.i24.i = load i32* %ptrcast.i23.i, align 4`
			`%updatedret.i30.i = insertelement <8 x i32> undef, i32 %val.i34.i, i32 1`
			`ret <8 x i32> %updatedret.i30.i`
			`}`