llvm-mirror/test/CodeGen/X86/sse3.ll

; These are tests for SSE3 codegen.

; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 \
; RUN:              | FileCheck %s --check-prefix=X64

; Test for v8xi16 lowering where we extract the first element of the vector and
; place it in the second element of the result.

; t0: load an <8 x i16> from %old, shuffle it against a constant vector whose
; element 0 is zero, and store the result to %dest.
; Shuffle mask <8, 0, undef...>: result[0] is element 0 of the second operand
; (the constant i16 0), result[1] is element 0 of the loaded vector, and the
; remaining lanes are undef.
; The FileCheck lines below expect this to become a single pslldq $2 between
; an aligned load and an aligned store.
define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
entry:
	%tmp3 = load <8 x i16>* %old
	%tmp6 = shufflevector <8 x i16> %tmp3,
                <8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
                <8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef  >
	store <8 x i16> %tmp6, <8 x i16>* %dest
	ret void
        
; X64: t0:
; X64:	movdqa	(%rsi), %xmm0
; X64:	pslldq	$2, %xmm0
; X64:	movdqa	%xmm0, (%rdi)
; X64:	ret
}

; t1: load two <8 x i16> vectors from %A and %B and return the vector from %A
; with lane 0 replaced by lane 0 of the vector from %B.
; Shuffle mask <8, 1, 2, ..., 7>: index 8 selects element 0 of the second
; operand; indices 1-7 keep the first operand's lanes in place.
; The FileCheck lines below expect the replacement to be a pinsrw $0 that takes
; its source directly from memory at (%rsi).
define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
	ret <8 x i16> %tmp3
        
; X64: t1:
; X64: 	movdqa	(%rdi), %xmm0
; X64: 	pinsrw	$0, (%rsi), %xmm0
; X64: 	ret
}

; t2: return %A with lanes 0 and 3 both replaced by lane 1 of %B.
; Shuffle mask <9, 1, 2, 9, 4, 5, 6, 7>: index 9 selects element 1 of the
; second operand; every other index keeps the %A lane at the same position.
; The FileCheck lines below expect one pextrw of that element followed by two
; pinsrw instructions (positions 0 and 3) reusing the extracted value.
define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
	ret <8 x i16> %tmp
; X64: t2:
; X64:	pextrw	$1, %xmm1, %eax
; X64:	pinsrw	$0, %eax, %xmm0
; X64:	pinsrw	$3, %eax, %xmm0
; X64:	ret
}

; t3: single-input permutation of %A. Both shuffle operands are %A, so mask
; indices 8 and 13 refer to %A[0] and %A[5]; the parameter %B is not used.
; Resulting lane order: A[0], A[3], A[2], A[5], A[7], A[6], A[5], A[4].
; The FileCheck lines below expect a pshuflw/pshufhw pair for the two halves,
; with lane 3 (a high-half element landing in the low half) patched in through
; a pextrw $5 / pinsrw $3 pair.
define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
	%tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
	ret <8 x i16> %tmp
; X64: t3:
; X64: 	pextrw	$5, %xmm0, %eax
; X64: 	pshuflw	$44, %xmm0, %xmm0
; X64: 	pshufhw	$27, %xmm0, %xmm0
; X64: 	pinsrw	$3, %eax, %xmm0
; X64: 	ret
}

; t4: permutation of %A only; every mask index is below 8, so no lane of %B
; reaches the result.
; Resulting lane order: A[0], A[7], A[2], A[3], A[1], A[5], A[6], A[5].
; Lanes 1 and 4 cross between the low and high 64-bit halves, and the
; FileCheck lines below expect each of them to be moved with a pextrw/pinsrw
; pair around a pshufhw. The checks capture the vector registers as FileCheck
; variables (XMM0, XMM1) rather than hard-coding them, except for the final
; pinsrw destination, which is %xmm0.
define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
	ret <8 x i16> %tmp
; X64: t4:
; X64: 	pextrw	$7, [[XMM0:%xmm[0-9]+]], %eax
; X64: 	pshufhw	$100, [[XMM0]], [[XMM1:%xmm[0-9]+]]
; X64: 	pinsrw	$1, %eax, [[XMM1]]
; X64: 	pextrw	$1, [[XMM0]], %eax
; X64: 	pinsrw	$4, %eax, %xmm0
; X64: 	ret
}

; t5: interleave the low four i16 lanes of %B and %A in adjacent pairs.
; Shuffle mask <8, 9, 0, 1, 10, 11, 2, 3> gives
; B[0], B[1], A[0], A[1], B[2], B[3], A[2], A[3].
; Every pair of i16 lanes moves together, so the shuffle is expressible on
; 32-bit elements; the FileCheck lines below expect movlhps followed by
; pshufd $114.
define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
	ret <8 x i16> %tmp
; X64: 	t5:
; X64: 		movlhps	%xmm1, %xmm0
; X64: 		pshufd	$114, %xmm0, %xmm0
; X64: 		ret
}

; t6: return %A with its two lowest i16 lanes replaced by the two lowest lanes
; of %B.
; Shuffle mask <8, 9, 2, 3, 4, 5, 6, 7>: indices 8 and 9 select B[0] and B[1];
; the rest keep the %A lanes in place.
; The replaced lanes form the low 32 bits of the vector, and the FileCheck
; lines below expect a single movss.
define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
	ret <8 x i16> %tmp
; X64: 	t6:
; X64: 		movss	%xmm1, %xmm0
; X64: 		ret
}

; t7: permutation of %A in which no lane crosses between the low and high
; 64-bit halves; every mask index is below 8, so %B does not reach the result.
; Resulting lane order: A[0], A[0], A[3], A[2], A[4], A[6], A[4], A[7].
; The FileCheck lines below expect exactly one pshuflw (low half) and one
; pshufhw (high half); the immediates are printed as signed values.
define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
	ret <8 x i16> %tmp
; X64: 	t7:
; X64: 		pshuflw	$-80, %xmm0, %xmm0
; X64: 		pshufhw	$-56, %xmm0, %xmm0
; X64: 		ret
}

; t8: the same kind of per-half permutation as a shufflevector, but written as
; a chain of extractelement/insertelement instructions.
; The <2 x i64> loaded from %A is viewed as <8 x i16>, all eight lanes are
; extracted, and a new vector is built with source lanes in the order
; 2, 1, 0, 3, 6, 5, 4, 7 (lanes 0 and 2 swapped, lanes 4 and 6 swapped).
; The result is cast back to <2 x i64> and stored to %res.
; The FileCheck lines below expect the whole chain to collapse into a pshuflw
; with the load folded in, a pshufhw, and an aligned store.
define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
	%tmp = load <2 x i64>* %A
	%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>
	%tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0
	%tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1
	%tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2
	%tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3
	%tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 4
	%tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5
	%tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 6
	%tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7
	%tmp8 = insertelement <8 x i16> undef, i16 %tmp2, i32 0
	%tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1
	%tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp0, i32 2
	%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3
	%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp6, i32 4
	%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5
	%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp4, i32 6
	%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7
	%tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64>
	store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res
	ret void
; X64: 	t8:
; X64: 		pshuflw	$-58, (%rsi), %xmm0
; X64: 		pshufhw	$-58, %xmm0, %xmm0
; X64: 		movdqa	%xmm0, (%rdi)
; X64: 		ret
}

; t9: replace the upper two floats of the <4 x float> at %r with the 64 bits
; stored at %A, updating %r in place.
; %A is reinterpreted as a double*, the double is loaded into lane 0 of a
; <2 x double> (lane 1 is undef), and that vector is viewed as <4 x float>.
; The result is assembled lane by lane: lanes 0-1 come from the original
; vector at %r, lanes 2-3 come from the two floats overlaying the loaded
; double.
; The FileCheck lines below expect a movhps with a memory operand between an
; aligned load and an aligned store of (%rdi).
define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
	%tmp = load <4 x float>* %r
	%tmp.upgrd.3 = bitcast <2 x i32>* %A to double*
	%tmp.upgrd.4 = load double* %tmp.upgrd.3
	%tmp.upgrd.5 = insertelement <2 x double> undef, double %tmp.upgrd.4, i32 0
	%tmp5 = insertelement <2 x double> %tmp.upgrd.5, double undef, i32 1	
	%tmp6 = bitcast <2 x double> %tmp5 to <4 x float>	
	%tmp.upgrd.6 = extractelement <4 x float> %tmp, i32 0	
	%tmp7 = extractelement <4 x float> %tmp, i32 1		
	%tmp8 = extractelement <4 x float> %tmp6, i32 0		
	%tmp9 = extractelement <4 x float> %tmp6, i32 1		
	%tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.6, i32 0	
	%tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1
	%tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2
	%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3
	store <4 x float> %tmp13, <4 x float>* %r
	ret void
; X64: 	t9:
; X64: 		movaps	(%rdi), %xmm0
; X64:	        movhps	(%rsi), %xmm0
; X64:	        movaps	%xmm0, (%rdi)
; X64: 		ret
}


; FIXME: This testcase produces icky code. It can be made much better!
; PR2585

@g1 = external constant <4 x i32>
@g2 = external constant <4 x i16>

; t10: pack the even-numbered i16 lanes of @g1 into 64 bits and store them to
; @g2.
; The instructions are unnamed, so they are referred to by their implicit
; numbers %1 through %6 (the unnamed entry block takes %0).
; Steps: load @g1 as <4 x i32>, view it as <8 x i16>, shuffle lanes 0, 2, 4, 6
; into the low four positions (upper four are undef), view that as <2 x i64>,
; take element 0, and view it as <4 x i16> for the store.
; The FileCheck lines below capture the vector register as the FileCheck
; variable X0 and do not check for a trailing ret.
; NOTE(review): @g2 is declared `external constant` but is the target of the
; store here; confirm that is intentional for this test.
define internal void @t10() nounwind {
        load <4 x i32>* @g1, align 16 
        bitcast <4 x i32> %1 to <8 x i16>
        shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
        bitcast <8 x i16> %3 to <2 x i64>  
        extractelement <2 x i64> %4, i32 0 
        bitcast i64 %5 to <4 x i16>        
        store <4 x i16> %6, <4 x i16>* @g2, align 8
        ret void
; X64: 	t10:
; X64: 		pextrw	$4, [[X0:%xmm[0-9]+]], %ecx
; X64: 		pextrw	$6, [[X0]], %eax
; X64: 		movlhps [[X0]], [[X0]]
; X64: 		pshuflw	$8, [[X0]], [[X0]]
; X64: 		pinsrw	$2, %ecx, [[X0]]
; X64: 		pinsrw	$3, %eax, [[X0]]
}


; Pack various elements via shuffles.
; t11: two-lane pack with the remaining six lanes undef.
; Shuffle mask <1, 8, undef...>: result[0] = T0[1], result[1] = T1[0].
; The FileCheck lines below expect T1[0] to travel through %eax (movd then
; pinsrw $1) while T0[1] is moved into lane 0 by a pshuflw $1.
define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
	%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
	ret <8 x i16> %tmp7

; X64: t11:
; X64:	movd	%xmm1, %eax
; X64:	movlhps	%xmm0, %xmm0
; X64:	pshuflw	$1, %xmm0, %xmm0
; X64:	pinsrw	$1, %eax, %xmm0
; X64:	ret
}


; t12: partially-undef shuffle mixing both inputs.
; Shuffle mask <0, 1, undef, undef, 3, 11, undef, undef>:
; result[0..1] = T0[0..1], result[4] = T0[3], result[5] = T1[3]; lanes 2, 3,
; 6 and 7 are undef.
; The FileCheck lines below expect the single T1 lane to be carried through
; %eax (pextrw $3 then pinsrw $5) and the T0 lanes to be arranged with
; movlhps and pshufhw.
define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
	ret <8 x i16> %tmp9

; X64: t12:
; X64: 	pextrw	$3, %xmm1, %eax
; X64: 	movlhps	%xmm0, %xmm0
; X64: 	pshufhw	$3, %xmm0, %xmm0
; X64: 	pinsrw	$5, %eax, %xmm0
; X64: 	ret
}


; t13: partially-undef shuffle drawing mostly from the second input.
; Shuffle mask <8, 9, undef, undef, 11, 3, undef, undef>:
; result[0..1] = T1[0..1], result[4] = T1[3], result[5] = T0[3]; lanes 2, 3,
; 6 and 7 are undef.
; The FileCheck lines below expect punpcklqdq to combine the low halves of the
; two inputs, followed by pshufd plus a pextrw/pinsrw pair for lane 4.
define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
	ret <8 x i16> %tmp9
; X64: t13:
; X64: 	punpcklqdq	%xmm0, %xmm1
; X64: 	pextrw	$3, %xmm1, %eax
; X64: 	pshufd	$52, %xmm1, %xmm0
; X64: 	pinsrw	$4, %eax, %xmm0
; X64: 	ret
}


; t14: partially-undef shuffle needing only three defined lanes.
; Shuffle mask <8, 9, undef, undef, undef, 2, undef, undef>:
; result[0..1] = T1[0..1], result[5] = T0[2]; all other lanes are undef.
; The FileCheck lines below expect just punpcklqdq followed by pshufhw, with
; no pextrw/pinsrw traffic through a general-purpose register.
define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >
	ret <8 x i16> %tmp9
; X64: t14:
; X64: 	punpcklqdq	%xmm0, %xmm1
; X64: 	pshufhw	$8, %xmm1, %xmm0
; X64: 	ret
}


; FIXME: t15 is worse off from disabling of scheduler 2-address hack.
; t15: partially-undef shuffle whose defined lanes sit in the middle of the
; vector.
; Shuffle mask <undef, undef, 7, 2, 8, undef, undef, undef>:
; result[2] = T0[7], result[3] = T0[2], result[4] = T1[0]; all other lanes are
; undef.
; The FileCheck lines below expect T0[7] to be carried through %eax
; (pextrw $7 then pinsrw $2) around a punpcklqdq and a pshuflw.
define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
entry:
        %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
        ret <8 x i16> %tmp8
; X64: 	t15:
; X64: 		pextrw	$7, %xmm0, %eax
; X64: 		punpcklqdq	%xmm1, %xmm0
; X64: 		pshuflw	$-128, %xmm0, %xmm0
; X64: 		pinsrw	$2, %eax, %xmm0
; X64: 		ret
}


; Test yonah where we convert a shuffle to pextrw and pinsrw
; t16: <16 x i8> case built from two chained shuffles.
; First shuffle (constant vector, %T0), mask <0, 1, 16, undef...>:
; lanes 0-1 take the constant's elements 0-1 (both i8 0) and lane 2 takes
; T0[0].
; Second shuffle (%tmp8, %T0), mask <0, 1, 2, 17, undef...>:
; lanes 0-2 are kept from %tmp8 and lane 3 takes T0[1].
; Net result: 0, 0, T0[0], T0[1], with the remaining twelve lanes undef.
; The FileCheck lines below pin the exact instruction sequence and the
; general-purpose registers (%eax, %ecx, %edx) that are used.
define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
entry:
        %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0,  i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
        %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0,  <16 x i32> < i32 0, i32 1, i32 2, i32 17,  i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
        ret <16 x i8> %tmp9
; X64: 	t16:
; X64: 		pextrw	$8, %xmm0, %eax
; X64: 		pslldq	$2, %xmm0
; X64: 		movd	%xmm0, %ecx
; X64: 		pextrw	$1, %xmm0, %edx
; X64: 		pinsrw	$0, %ecx, %xmm0
; X64: 		ret
}

; rdar://8520311
; t17: checks that a movddup with a memory operand is selected.
; Both loads read a <4 x float> through an undef pointer. Only the %tmp3 chain
; reaches the return value: it is cast to <4 x i32>, shuffled with mask
; <undef, undef, 0, 1> so its low two lanes land in lanes 2-3, then ANDed with
; <undef, undef, -1, 0>. %tmp2 and %tmp4 are computed but do not feed the
; returned value.
; Unlike the other functions in this file, the FileCheck lines sit at the top
; of the body, before the IR they describe.
define <4 x i32> @t17() nounwind {
entry:
; X64: t17:
; X64:          movddup (%rax), %xmm0
  %tmp1 = load <4 x float>* undef, align 16
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %tmp3 = load <4 x float>* undef, align 16
  %tmp4 = shufflevector <4 x float> %tmp2, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  %tmp5 = bitcast <4 x float> %tmp3 to <4 x i32>
  %tmp6 = shufflevector <4 x i32> %tmp5, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  %tmp7 = and <4 x i32> %tmp6, <i32 undef, i32 undef, i32 -1, i32 0>
  ret <4 x i32> %tmp7
}
Fix broken x86_64 tests which specify non-64-bit cpu's. llvm-svn: 134756 2011-07-09 00:29:33 +02:00			`; These are tests for SSE3 codegen.`
convert test to filecheck format. llvm-svn: 79114 2009-08-15 19:05:03 +02:00
Fix broken x86_64 tests which specify non-64-bit cpu's. llvm-svn: 134756 2011-07-09 00:29:33 +02:00			`; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 \`
specify a target triple so global variable manglings are consistent etc. llvm-svn: 79118 2009-08-15 19:35:05 +02:00			`; RUN: \| FileCheck %s --check-prefix=X64`
Fixed lowering of v816 shuffles. llvm-svn: 63252 2009-01-29 00:11:14 +01:00
			`; Test for v8xi16 lowering where we extract the first element of the vector and`
			`; placed it in the second element of the result.`

merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {`
Fixed lowering of v816 shuffles. llvm-svn: 63252 2009-01-29 00:11:14 +01:00			`entry:`
convert test to filecheck format. llvm-svn: 79114 2009-08-15 19:05:03 +02:00			`%tmp3 = load <8 x i16>* %old`
			`%tmp6 = shufflevector <8 x i16> %tmp3,`
			`<8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,`
			`<8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >`
			`store <8 x i16> %tmp6, <8 x i16>* %dest`
Fixed lowering of v816 shuffles. llvm-svn: 63252 2009-01-29 00:11:14 +01:00			`ret void`
convert test to filecheck format. llvm-svn: 79114 2009-08-15 19:05:03 +02:00
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: t0:`
Reapply r143206, with fixes. Disallow physical register lifetimes across calls, and only check for nested dependences on the special call-sequence-resource register. llvm-svn: 143660 2011-11-03 22:49:52 +01:00			`; X64: movdqa (%rsi), %xmm0`
			`; X64: pslldq $2, %xmm0`
Enable -sse-domain-fix by default. Now with tests! llvm-svn: 99954 2010-03-31 00:47:00 +02:00			`; X64: movdqa %xmm0, (%rdi)`
convert test to filecheck format. llvm-svn: 79114 2009-08-15 19:05:03 +02:00			`; X64: ret`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`}`

			`define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {`
			`%tmp1 = load <8 x i16>* %A`
			`%tmp2 = load <8 x i16>* %B`
			`%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >`
			`ret <8 x i16> %tmp3`

			`; X64: t1:`
Enable -sse-domain-fix by default. Now with tests! llvm-svn: 99954 2010-03-31 00:47:00 +02:00			`; X64: movdqa (%rdi), %xmm0`
Reapply r143206, with fixes. Disallow physical register lifetimes across calls, and only check for nested dependences on the special call-sequence-resource register. llvm-svn: 143660 2011-11-03 22:49:52 +01:00			`; X64: pinsrw $0, (%rsi), %xmm0`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: ret`
			`}`

			`define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {`
			`%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >`
			`ret <8 x i16> %tmp`
			`; X64: t2:`
			`; X64: pextrw $1, %xmm1, %eax`
			`; X64: pinsrw $0, %eax, %xmm0`
			`; X64: pinsrw $3, %eax, %xmm0`
			`; X64: ret`
			`}`

			`define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {`
			`%tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >`
			`ret <8 x i16> %tmp`
			`; X64: t3:`
			`; X64: pextrw $5, %xmm0, %eax`
Revert r84658 and r84691. They were causing llvm-gcc bootstrap to fail. llvm-svn: 84727 2009-10-21 03:44:44 +02:00			`; X64: pshuflw $44, %xmm0, %xmm0`
			`; X64: pshufhw $27, %xmm0, %xmm0`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: pinsrw $3, %eax, %xmm0`
			`; X64: ret`
			`}`

			`define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {`
			`%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >`
			`ret <8 x i16> %tmp`
			`; X64: t4:`
Fix a batch of x86 tests to be coalescer independent. Most of these tests require a single mov instruction that can come either before or after a 2-addr instruction. -join-physregs changes the behavior, but the results are equivalent. llvm-svn: 130891 2011-05-05 01:54:51 +02:00			`; X64: pextrw $7, [[XMM0:%xmm[0-9]+]], %eax`
			`; X64: pshufhw $100, [[XMM0]], [[XMM1:%xmm[0-9]+]]`
			`; X64: pinsrw $1, %eax, [[XMM1]]`
			`; X64: pextrw $1, [[XMM0]], %eax`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: pinsrw $4, %eax, %xmm0`
			`; X64: ret`
			`}`

			`define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {`
			`%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >`
			`ret <8 x i16> %tmp`
			`; X64: t5:`
			`; X64: movlhps %xmm1, %xmm0`
			`; X64: pshufd $114, %xmm0, %xmm0`
			`; X64: ret`
			`}`

			`define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {`
			`%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >`
			`ret <8 x i16> %tmp`
			`; X64: t6:`
			`; X64: movss %xmm1, %xmm0`
			`; X64: ret`
			`}`

			`define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {`
			`%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >`
			`ret <8 x i16> %tmp`
			`; X64: t7:`
change selectiondag to add the sign extended versions of immediate operands to instructions instead of zero extended ones. This makes the asmprinter print signed values more consistently. This apparently only really affects the X86 backend. llvm-svn: 81265 2009-09-09 01:05:44 +02:00			`; X64: pshuflw $-80, %xmm0, %xmm0`
			`; X64: pshufhw $-56, %xmm0, %xmm0`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: ret`
			`}`

			`define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {`
			`%tmp = load <2 x i64>* %A`
			`%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>`
			`%tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0`
			`%tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1`
			`%tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2`
			`%tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3`
			`%tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 4`
			`%tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5`
			`%tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 6`
			`%tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7`
			`%tmp8 = insertelement <8 x i16> undef, i16 %tmp2, i32 0`
			`%tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1`
			`%tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp0, i32 2`
			`%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3`
			`%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp6, i32 4`
			`%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5`
			`%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp4, i32 6`
			`%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7`
			`%tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64>`
			`store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res`
			`ret void`
			`; X64: t8:`
change selectiondag to add the sign extended versions of immediate operands to instructions instead of zero extended ones. This makes the asmprinter print signed values more consistently. This apparently only really affects the X86 backend. llvm-svn: 81265 2009-09-09 01:05:44 +02:00			`; X64: pshuflw $-58, (%rsi), %xmm0`
			`; X64: pshufhw $-58, %xmm0, %xmm0`
Enable -sse-domain-fix by default. Now with tests! llvm-svn: 99954 2010-03-31 00:47:00 +02:00			`; X64: movdqa %xmm0, (%rdi)`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: ret`
			`}`

			`define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {`
			`%tmp = load <4 x float>* %r`
			`%tmp.upgrd.3 = bitcast <2 x i32>* %A to double*`
			`%tmp.upgrd.4 = load double* %tmp.upgrd.3`
			`%tmp.upgrd.5 = insertelement <2 x double> undef, double %tmp.upgrd.4, i32 0`
			`%tmp5 = insertelement <2 x double> %tmp.upgrd.5, double undef, i32 1`
			`%tmp6 = bitcast <2 x double> %tmp5 to <4 x float>`
			`%tmp.upgrd.6 = extractelement <4 x float> %tmp, i32 0`
			`%tmp7 = extractelement <4 x float> %tmp, i32 1`
			`%tmp8 = extractelement <4 x float> %tmp6, i32 0`
			`%tmp9 = extractelement <4 x float> %tmp6, i32 1`
			`%tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.6, i32 0`
			`%tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1`
			`%tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2`
			`%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3`
			`store <4 x float> %tmp13, <4 x float>* %r`
			`ret void`
			`; X64: t9:`
Fix some issues in WalkChainUsers dealing with CopyToReg/CopyFromReg/INLINEASM. These are annoying because they have the same opcode before an after isel. Fix this by setting their NodeID to -1 to indicate that they are selected, just like what automatically happens when selecting things that end up being machine nodes. With that done, give IsLegalToFold a new flag that causes it to ignore chains. This lets the HandleMergeInputChains routine be the one place that validates chains after a match is successful, enabling the new hotness in chain processing. This smarter chain processing eliminates the need for "PreprocessRMW" in the X86 and MSP430 backends and enables MSP to start matching it's multiple mem operand instructions more aggressively. I currently #if out the dead code in the X86 backend and MSP backend, I'll remove it for real in a follow-on patch. The testcase changes are: test/CodeGen/X86/sse3.ll: we generate better code test/CodeGen/X86/store_op_load_fold2.ll: PreprocessRMW was miscompiling this before, we now generate correct code Convert it to filecheck while I'm at it. test/CodeGen/MSP430/Inst16mm.ll: Add a testcase for mem/mem folding to make anton happy. :) llvm-svn: 97596 2010-03-02 23:20:06 +01:00			`; X64: movaps (%rdi), %xmm0`
			`; X64: movhps (%rsi), %xmm0`
			`; X64: movaps %xmm0, (%rdi)`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: ret`
			`}`



			`; FIXME: This testcase produces icky code. It can be made much better!`
			`; PR2585`

			`@g1 = external constant <4 x i32>`
			`@g2 = external constant <4 x i16>`

			`define internal void @t10() nounwind {`
			`load <4 x i32>* @g1, align 16`
			`bitcast <4 x i32> %1 to <8 x i16>`
			`shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >`
			`bitcast <8 x i16> %3 to <2 x i64>`
			`extractelement <2 x i64> %4, i32 0`
			`bitcast i64 %5 to <4 x i16>`
			`store <4 x i16> %6, <4 x i16>* @g2, align 8`
			`ret void`
			`; X64: t10:`
Teach two-address pass to re-schedule two-address instructions (or the kill instructions of the two-address operands) in order to avoid inserting copies. This fixes the few regressions introduced when the two-address hack was disabled (without regressing the improvements). rdar://10422688 llvm-svn: 144559 2011-11-14 20:48:55 +01:00			`; X64: pextrw $4, [[X0:%xmm[0-9]+]], %ecx`
Fix register-dependent X86 tests. llvm-svn: 128867 2011-04-05 02:32:44 +02:00			`; X64: pextrw $6, [[X0]], %eax`
Teach two-address pass to re-schedule two-address instructions (or the kill instructions of the two-address operands) in order to avoid inserting copies. This fixes the few regressions introduced when the two-address hack was disabled (without regressing the improvements). rdar://10422688 llvm-svn: 144559 2011-11-14 20:48:55 +01:00			`; X64: movlhps [[X0]], [[X0]]`
			`; X64: pshuflw $8, [[X0]], [[X0]]`
			`; X64: pinsrw $2, %ecx, [[X0]]`
			`; X64: pinsrw $3, %eax, [[X0]]`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`}`


			`; Pack various elements via shuffles.`
			`define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {`
			`entry:`
			`%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >`
			`ret <8 x i16> %tmp7`

			`; X64: t11:`
Turn on post-alloc scheduling for x86. llvm-svn: 84431 2009-10-18 21:57:27 +02:00			`; X64: movd %xmm1, %eax`
Turning off post-ra scheduling for x86. It isn't a consistent win. llvm-svn: 98810 2010-03-18 07:55:42 +01:00			`; X64: movlhps %xmm0, %xmm0`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: pshuflw $1, %xmm0, %xmm0`
			`; X64: pinsrw $1, %eax, %xmm0`
			`; X64: ret`
			`}`


			`define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {`
			`entry:`
			`%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >`
			`ret <8 x i16> %tmp9`

			`; X64: t12:`
Turn on post-alloc scheduling for x86. llvm-svn: 84431 2009-10-18 21:57:27 +02:00			`; X64: pextrw $3, %xmm1, %eax`
Turning off post-ra scheduling for x86. It isn't a consistent win. llvm-svn: 98810 2010-03-18 07:55:42 +01:00			`; X64: movlhps %xmm0, %xmm0`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: pshufhw $3, %xmm0, %xmm0`
			`; X64: pinsrw $5, %eax, %xmm0`
			`; X64: ret`
			`}`


			`define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {`
			`entry:`
			`%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >`
			`ret <8 x i16> %tmp9`
			`; X64: t13:`
			`; X64: punpcklqdq %xmm0, %xmm1`
			`; X64: pextrw $3, %xmm1, %eax`
			`; X64: pshufd $52, %xmm1, %xmm0`
			`; X64: pinsrw $4, %eax, %xmm0`
			`; X64: ret`
			`}`


			`define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {`
			`entry:`
			`%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >`
			`ret <8 x i16> %tmp9`
			`; X64: t14:`
			`; X64: punpcklqdq %xmm0, %xmm1`
			`; X64: pshufhw $8, %xmm1, %xmm0`
			`; X64: ret`
			`}`


Use a bigger hammer to fix PR11314 by disabling the "forcing two-address instruction lower optimization" in the pre-RA scheduler. The optimization, rather the hack, was done before MI use-list was available. Now we should be able to implement it in a better way, perhaps in the two-address pass until a MI scheduler is available. Now that the scheduler has to backtrack to handle call sequences. Adding artificial scheduling constraints is just not safe. Furthermore, the hack is not taking all the other scheduling decisions into consideration so it's just as likely to pessimize code. So I view disabling this optimization goodness regardless of PR11314. llvm-svn: 144267 2011-11-10 08:43:16 +01:00			`; FIXME: t15 is worse off from disabling of scheduler 2-address hack.`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {`
			`entry:`
			`%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >`
			`ret <8 x i16> %tmp8`
			`; X64: t15:`
Teach two-address pass to re-schedule two-address instructions (or the kill instructions of the two-address operands) in order to avoid inserting copies. This fixes the few regressions introduced when the two-address hack was disabled (without regressing the improvements). rdar://10422688 llvm-svn: 144559 2011-11-14 20:48:55 +01:00			`; X64: pextrw $7, %xmm0, %eax`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: punpcklqdq %xmm1, %xmm0`
change selectiondag to add the sign extended versions of immediate operands to instructions instead of zero extended ones. This makes the asmprinter print signed values more consistently. This apparently only really affects the X86 backend. llvm-svn: 81265 2009-09-09 01:05:44 +02:00			`; X64: pshuflw $-128, %xmm0, %xmm0`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: pinsrw $2, %eax, %xmm0`
			`; X64: ret`
			`}`


			`; Test yonah where we convert a shuffle to pextrw and pinrsw`
			`define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {`
			`entry:`
			`%tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >`
			`%tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >`
			`ret <16 x i8> %tmp9`
			`; X64: t16:`
Teach two-address pass to re-schedule two-address instructions (or the kill instructions of the two-address operands) in order to avoid inserting copies. This fixes the few regressions introduced when the two-address hack was disabled (without regressing the improvements). rdar://10422688 llvm-svn: 144559 2011-11-14 20:48:55 +01:00			`; X64: pextrw $8, %xmm0, %eax`
			`; X64: pslldq $2, %xmm0`
			`; X64: movd %xmm0, %ecx`
			`; X64: pextrw $1, %xmm0, %edx`
			`; X64: pinsrw $0, %ecx, %xmm0`
merge a bunch more sse3 tests into sse3.ll llvm-svn: 79115 2009-08-15 19:21:44 +02:00			`; X64: ret`
			`}`
Canonicalize X86ISD::MOVDDUP nodes to v2f64 to make sure all cases match. Also eliminate unneeded isel patterns. rdar://8520311 llvm-svn: 115977 2010-10-07 22:50:20 +02:00
			`; rdar://8520311`
			`define <4 x i32> @t17() nounwind {`
			`entry:`
			`; X64: t17:`
			`; X64: movddup (%rax), %xmm0`
			`%tmp1 = load <4 x float>* undef, align 16`
			`%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3>`
			`%tmp3 = load <4 x float>* undef, align 16`
			`%tmp4 = shufflevector <4 x float> %tmp2, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>`
			`%tmp5 = bitcast <4 x float> %tmp3 to <4 x i32>`
			`%tmp6 = shufflevector <4 x i32> %tmp5, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>`
			`%tmp7 = and <4 x i32> %tmp6, <i32 undef, i32 undef, i32 -1, i32 0>`
			`ret <4 x i32> %tmp7`
			`}`