; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s
; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0

; Using target-specific nodes for shuffle lowering makes the mask check
; stricter.  This broke some cases that were not covered by the test suite,
; but it also exposed foldings that were not done before; for this example,
;   movaps (%rdi), %xmm0
;   movaps (%rax), %xmm1
;   movaps %xmm0, %xmm2
;   movss  %xmm1, %xmm2
;   shufps $36, %xmm2, %xmm0
; is now generated as
;   movaps (%rdi), %xmm0
;   movaps %xmm0, %xmm1
;   movlps (%rax), %xmm1
;   shufps $36, %xmm1, %xmm0
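; Note on the immediate (assuming the standard SHUFPS encoding): $36
; (0x24 = 0b00100100) keeps elements 0 and 1 of the destination register and
; takes elements 2 and 0 of the source register, which corresponds to the
; <0, 1, 2, 4> shufflevector mask used in @t00 below.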
define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
entry:
; CHECK: movaps ({{%rdi|%rcx}}), %[[XMM0:xmm[0-9]+]]
; CHECK: movaps %[[XMM0]], %[[XMM1:xmm[0-9]+]]
; CHECK-NEXT: movss %xmm{{[0-9]+}}, %[[XMM1]]
; CHECK-NEXT: shufps $36, %[[XMM1]], %[[XMM0]]
%0 = load <4 x i32>* undef, align 16
%1 = load <4 x i32>* %a0, align 16
%2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %2
}
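
; @t01 inserts a scalar load into lane 1 of an otherwise-undef <2 x double>;
; the CHECK_O0 lines below expect the unoptimized lowering to use a movsd
; load followed by an unpcklpd.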
define void @t01(double* %a0) nounwind ssp {
entry:
; CHECK_O0: movsd (%eax), %xmm0
; CHECK_O0: unpcklpd %xmm0, %xmm0
%tmp93 = load double* %a0, align 8
%vecinit94 = insertelement <2 x double> undef, double %tmp93, i32 1
store <2 x double> %vecinit94, <2 x double>* undef
ret void
}
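
; @t02 builds a <2 x i32> from element 0 of one loaded vector and element 1
; of another and stores it; the CHECK lines below expect the lowering to go
; through movaps/shufps/pshufd/movq.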
define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
entry:
; CHECK: t02
; CHECK: movaps
; CHECK: shufps
; CHECK: pshufd
; CHECK: movq
; CHECK: ret
%0 = bitcast <8 x i32>* %source to <4 x i32>*
%arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
%tmp2 = load <4 x i32>* %arrayidx, align 16
%tmp3 = extractelement <4 x i32> %tmp2, i32 0
%tmp5 = insertelement <2 x i32> <i32 undef, i32 0>, i32 %tmp3, i32 0
%arrayidx7 = getelementptr inbounds <8 x i32>* %source, i64 1
%1 = bitcast <8 x i32>* %arrayidx7 to <4 x i32>*
%tmp8 = load <4 x i32>* %1, align 16
%tmp9 = extractelement <4 x i32> %tmp8, i32 1
%tmp11 = insertelement <2 x i32> %tmp5, i32 %tmp9, i32 1
store <2 x i32> %tmp11, <2 x i32>* %dest, align 8
ret void
}