llvm-mirror/test/CodeGen/X86/vec_shuffle-21.ll

; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah -o %t -f
; RUN: grep pshuflw %t | count 1
; RUN: grep pextrw %t | count 2
; RUN: grep pinsrw %t | count 2
; PR2585

; FIXME: This testcase produces icky code. It can be made much better!

; Two unnamed external globals. Because they carry no names they are referred
; to by declaration order: the <4 x i32> is @0 (loaded by the function below)
; and the <4 x i16> is @1 (the destination of the function's store).
; NOTE(review): @1 is declared `constant` yet is written by the store in the
; function below -- presumably an artifact of test reduction; confirm before
; changing either side.
external constant <4 x i32>             ; <<4 x i32>*>:0 [#uses=1]
external constant <4 x i16>             ; <<4 x i16>*>:1 [#uses=1]

; Anonymous internal function forming the body of the PR2585 reproducer.
; It loads the <4 x i32> global @0, reinterprets it as <8 x i16>, gathers the
; even-indexed i16 lanes (0, 2, 4, 6) into the low four lanes, and stores those
; four lanes to @1 as a <4 x i16>.
; The instructions are unnamed; each result is referenced by its implicit
; number (%1..%6), which the trailing comments on each line record.
define internal void @""() {
        ; %1: fetch the 128-bit source vector from the first unnamed global.
        load <4 x i32>* @0, align 16            ; <<4 x i32>>:1 [#uses=1]
        ; %2: view the same 128 bits as eight i16 lanes.
        bitcast <4 x i32> %1 to <8 x i16>               ; <<8 x i16>>:2[#uses=1]
        ; %3: move lanes 0, 2, 4, 6 into result lanes 0..3; result lanes 4..7
        ; are undef, so only the low half of the result carries data.
        shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >               ; <<8x i16>>:3 [#uses=1]
        ; %4, %5: view the shuffle result as two i64 halves and take element 0.
        ; (Assumes element 0 covers lanes 0..3, i.e. a little-endian target
        ; such as the x86 one this test is run for.)
        bitcast <8 x i16> %3 to <2 x i64>               ; <<2 x i64>>:4 [#uses=1]
        extractelement <2 x i64> %4, i32 0              ; <i64>:5 [#uses=1]
        ; %6: reinterpret the 64-bit scalar as four i16 lanes and write them
        ; to the second unnamed global.
        bitcast i64 %5 to <4 x i16>             ; <<4 x i16>>:6 [#uses=1]
        store <4 x i16> %6, <4 x i16>* @1, align 8
        ret void
}
Generate better code for v8i16 shuffles on SSE2.
Generate better code for v16i8 shuffles on SSE2 (avoids stack).
Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it is fewer uops.
Document the shuffle matching logic and add some FIXMEs for later further cleanups.
New tests that test the above.

Examples:

New:
_shuf2:
    pextrw $7, %xmm0, %eax
    punpcklqdq %xmm1, %xmm0
    pshuflw $128, %xmm0, %xmm0
    pinsrw $2, %eax, %xmm0

Old:
_shuf2:
    pextrw $2, %xmm0, %eax
    pextrw $7, %xmm0, %ecx
    pinsrw $2, %ecx, %xmm0
    pinsrw $3, %eax, %xmm0
    movd %xmm1, %eax
    pinsrw $4, %eax, %xmm0
    ret

=========

New:
_shuf4:
    punpcklqdq %xmm1, %xmm0
    pshufb LCPI1_0, %xmm0

Old:
_shuf4:
    pextrw $3, %xmm0, %eax
    movsd %xmm1, %xmm0
    pextrw $3, %xmm1, %ecx
    pinsrw $4, %ecx, %xmm0
    pinsrw $5, %eax, %xmm0

========

New:
_shuf1:
    pushl %ebx
    pushl %edi
    pushl %esi
    pextrw $1, %xmm0, %eax
    rolw $8, %ax
    movd %xmm0, %ecx
    rolw $8, %cx
    pextrw $5, %xmm0, %edx
    pextrw $4, %xmm0, %esi
    pextrw $3, %xmm0, %edi
    pextrw $2, %xmm0, %ebx
    movaps %xmm0, %xmm1
    pinsrw $0, %ecx, %xmm1
    pinsrw $1, %eax, %xmm1
    rolw $8, %bx
    pinsrw $2, %ebx, %xmm1
    rolw $8, %di
    pinsrw $3, %edi, %xmm1
    rolw $8, %si
    pinsrw $4, %esi, %xmm1
    rolw $8, %dx
    pinsrw $5, %edx, %xmm1
    pextrw $7, %xmm0, %eax
    rolw $8, %ax
    movaps %xmm1, %xmm0
    pinsrw $7, %eax, %xmm0
    popl %esi
    popl %edi
    popl %ebx
    ret

Old:
_shuf1:
    subl $252, %esp
    movaps %xmm0, (%esp)
    movaps %xmm0, 16(%esp)
    movaps %xmm0, 32(%esp)
    movaps %xmm0, 48(%esp)
    movaps %xmm0, 64(%esp)
    movaps %xmm0, 80(%esp)
    movaps %xmm0, 96(%esp)
    movaps %xmm0, 224(%esp)
    movaps %xmm0, 208(%esp)
    movaps %xmm0, 192(%esp)
    movaps %xmm0, 176(%esp)
    movaps %xmm0, 160(%esp)
    movaps %xmm0, 144(%esp)
    movaps %xmm0, 128(%esp)
    movaps %xmm0, 112(%esp)
    movzbl 14(%esp), %eax
    movd %eax, %xmm1
    movzbl 22(%esp), %eax
    movd %eax, %xmm2
    punpcklbw %xmm1, %xmm2
    movzbl 42(%esp), %eax
    movd %eax, %xmm1
    movzbl 50(%esp), %eax
    movd %eax, %xmm3
    punpcklbw %xmm1, %xmm3
    punpcklbw %xmm2, %xmm3
    movzbl 77(%esp), %eax
    movd %eax, %xmm1
    movzbl 84(%esp), %eax
    movd %eax, %xmm2
    punpcklbw %xmm1, %xmm2
    movzbl 104(%esp), %eax
    movd %eax, %xmm1
    punpcklbw %xmm1, %xmm0
    punpcklbw %xmm2, %xmm0
    movaps %xmm0, %xmm1
    punpcklbw %xmm3, %xmm1
    movzbl 127(%esp), %eax
    movd %eax, %xmm0
    movzbl 135(%esp), %eax
    movd %eax, %xmm2
    punpcklbw %xmm0, %xmm2
    movzbl 155(%esp), %eax
    movd %eax, %xmm0
    movzbl 163(%esp), %eax
    movd %eax, %xmm3
    punpcklbw %xmm0, %xmm3
    punpcklbw %xmm2, %xmm3
    movzbl 188(%esp), %eax
    movd %eax, %xmm0
    movzbl 197(%esp), %eax
    movd %eax, %xmm2
    punpcklbw %xmm0, %xmm2
    movzbl 217(%esp), %eax
    movd %eax, %xmm4
    movzbl 225(%esp), %eax
    movd %eax, %xmm0
    punpcklbw %xmm4, %xmm0
    punpcklbw %xmm2, %xmm0
    punpcklbw %xmm3, %xmm0
    punpcklbw %xmm1, %xmm0
    addl $252, %esp
    ret

llvm-svn: 65311 2009-02-23 09:49:38 +01:00			`; RUN: llvm-as < %s \| llc -march=x86 -mcpu=yonah -o %t -f`
Testcase for PR2585. llvm-svn: 55151 2008-08-22 01:04:49 +02:00			`; RUN: grep pshuflw %t \| count 1`
			`; RUN: grep pextrw %t \| count 2`
			`; RUN: grep pinsrw %t \| count 2`
			`; PR2585`

			`; FIXME: This testcase produces icky code. It can be made much better!`

			`external constant <4 x i32> ; <<4 x i32>*>:0 [#uses=1]`
			`external constant <4 x i16> ; <<4 x i16>*>:1 [#uses=1]`

			`define internal void @""() {`
			`load <4 x i32>* @0, align 16 ; <<4 x i32>>:1 [#uses=1]`
			`bitcast <4 x i32> %1 to <8 x i16> ; <<8 x i16>>:2[#uses=1]`
			`shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef > ; <<8x i16>>:3 [#uses=1]`
			`bitcast <8 x i16> %3 to <2 x i64> ; <<2 x i64>>:4 [#uses=1]`
			`extractelement <2 x i64> %4, i32 0 ; <i64>:5 [#uses=1]`
			`bitcast i64 %5 to <4 x i16> ; <<4 x i16>>:6 [#uses=1]`
			`store <4 x i16> %6, <4 x i16>* @1, align 8`
			`ret void`
			`}`