2009-09-09 01:54:48 +02:00
|
|
|
; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t
|
Generate better code for v8i16 shuffles on SSE2
Generate better code for v16i8 shuffles on SSE2 (avoids stack)
Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it is fewer uops.
Document the shuffle matching logic and add some FIXMEs for later further
cleanups.
New tests that test the above.
Examples:
New:
_shuf2:
pextrw $7, %xmm0, %eax
punpcklqdq %xmm1, %xmm0
pshuflw $128, %xmm0, %xmm0
pinsrw $2, %eax, %xmm0
Old:
_shuf2:
pextrw $2, %xmm0, %eax
pextrw $7, %xmm0, %ecx
pinsrw $2, %ecx, %xmm0
pinsrw $3, %eax, %xmm0
movd %xmm1, %eax
pinsrw $4, %eax, %xmm0
ret
=========
New:
_shuf4:
punpcklqdq %xmm1, %xmm0
pshufb LCPI1_0, %xmm0
Old:
_shuf4:
pextrw $3, %xmm0, %eax
movsd %xmm1, %xmm0
pextrw $3, %xmm1, %ecx
pinsrw $4, %ecx, %xmm0
pinsrw $5, %eax, %xmm0
========
New:
_shuf1:
pushl %ebx
pushl %edi
pushl %esi
pextrw $1, %xmm0, %eax
rolw $8, %ax
movd %xmm0, %ecx
rolw $8, %cx
pextrw $5, %xmm0, %edx
pextrw $4, %xmm0, %esi
pextrw $3, %xmm0, %edi
pextrw $2, %xmm0, %ebx
movaps %xmm0, %xmm1
pinsrw $0, %ecx, %xmm1
pinsrw $1, %eax, %xmm1
rolw $8, %bx
pinsrw $2, %ebx, %xmm1
rolw $8, %di
pinsrw $3, %edi, %xmm1
rolw $8, %si
pinsrw $4, %esi, %xmm1
rolw $8, %dx
pinsrw $5, %edx, %xmm1
pextrw $7, %xmm0, %eax
rolw $8, %ax
movaps %xmm1, %xmm0
pinsrw $7, %eax, %xmm0
popl %esi
popl %edi
popl %ebx
ret
Old:
_shuf1:
subl $252, %esp
movaps %xmm0, (%esp)
movaps %xmm0, 16(%esp)
movaps %xmm0, 32(%esp)
movaps %xmm0, 48(%esp)
movaps %xmm0, 64(%esp)
movaps %xmm0, 80(%esp)
movaps %xmm0, 96(%esp)
movaps %xmm0, 224(%esp)
movaps %xmm0, 208(%esp)
movaps %xmm0, 192(%esp)
movaps %xmm0, 176(%esp)
movaps %xmm0, 160(%esp)
movaps %xmm0, 144(%esp)
movaps %xmm0, 128(%esp)
movaps %xmm0, 112(%esp)
movzbl 14(%esp), %eax
movd %eax, %xmm1
movzbl 22(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm1, %xmm2
movzbl 42(%esp), %eax
movd %eax, %xmm1
movzbl 50(%esp), %eax
movd %eax, %xmm3
punpcklbw %xmm1, %xmm3
punpcklbw %xmm2, %xmm3
movzbl 77(%esp), %eax
movd %eax, %xmm1
movzbl 84(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm1, %xmm2
movzbl 104(%esp), %eax
movd %eax, %xmm1
punpcklbw %xmm1, %xmm0
punpcklbw %xmm2, %xmm0
movaps %xmm0, %xmm1
punpcklbw %xmm3, %xmm1
movzbl 127(%esp), %eax
movd %eax, %xmm0
movzbl 135(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm0, %xmm2
movzbl 155(%esp), %eax
movd %eax, %xmm0
movzbl 163(%esp), %eax
movd %eax, %xmm3
punpcklbw %xmm0, %xmm3
punpcklbw %xmm2, %xmm3
movzbl 188(%esp), %eax
movd %eax, %xmm0
movzbl 197(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm0, %xmm2
movzbl 217(%esp), %eax
movd %eax, %xmm4
movzbl 225(%esp), %eax
movd %eax, %xmm0
punpcklbw %xmm4, %xmm0
punpcklbw %xmm2, %xmm0
punpcklbw %xmm3, %xmm0
punpcklbw %xmm1, %xmm0
addl $252, %esp
ret
llvm-svn: 65311
2009-02-23 09:49:38 +01:00
|
|
|
; RUN: grep pextrw %t | count 13
|
|
|
|
; RUN: grep pinsrw %t | count 14
|
|
|
|
; RUN: grep rolw %t | count 13
|
|
|
|
; RUN: not grep esp %t
|
|
|
|
; RUN: not grep ebp %t
|
2009-09-09 01:54:48 +02:00
|
|
|
; RUN: llc < %s -march=x86 -mcpu=core2 -stack-alignment=16 -o %t
|
Generate better code for v8i16 shuffles on SSE2
Generate better code for v16i8 shuffles on SSE2 (avoids stack)
Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it is fewer uops.
Document the shuffle matching logic and add some FIXMEs for later further
cleanups.
New tests that test the above.
Examples:
New:
_shuf2:
pextrw $7, %xmm0, %eax
punpcklqdq %xmm1, %xmm0
pshuflw $128, %xmm0, %xmm0
pinsrw $2, %eax, %xmm0
Old:
_shuf2:
pextrw $2, %xmm0, %eax
pextrw $7, %xmm0, %ecx
pinsrw $2, %ecx, %xmm0
pinsrw $3, %eax, %xmm0
movd %xmm1, %eax
pinsrw $4, %eax, %xmm0
ret
=========
New:
_shuf4:
punpcklqdq %xmm1, %xmm0
pshufb LCPI1_0, %xmm0
Old:
_shuf4:
pextrw $3, %xmm0, %eax
movsd %xmm1, %xmm0
pextrw $3, %xmm1, %ecx
pinsrw $4, %ecx, %xmm0
pinsrw $5, %eax, %xmm0
========
New:
_shuf1:
pushl %ebx
pushl %edi
pushl %esi
pextrw $1, %xmm0, %eax
rolw $8, %ax
movd %xmm0, %ecx
rolw $8, %cx
pextrw $5, %xmm0, %edx
pextrw $4, %xmm0, %esi
pextrw $3, %xmm0, %edi
pextrw $2, %xmm0, %ebx
movaps %xmm0, %xmm1
pinsrw $0, %ecx, %xmm1
pinsrw $1, %eax, %xmm1
rolw $8, %bx
pinsrw $2, %ebx, %xmm1
rolw $8, %di
pinsrw $3, %edi, %xmm1
rolw $8, %si
pinsrw $4, %esi, %xmm1
rolw $8, %dx
pinsrw $5, %edx, %xmm1
pextrw $7, %xmm0, %eax
rolw $8, %ax
movaps %xmm1, %xmm0
pinsrw $7, %eax, %xmm0
popl %esi
popl %edi
popl %ebx
ret
Old:
_shuf1:
subl $252, %esp
movaps %xmm0, (%esp)
movaps %xmm0, 16(%esp)
movaps %xmm0, 32(%esp)
movaps %xmm0, 48(%esp)
movaps %xmm0, 64(%esp)
movaps %xmm0, 80(%esp)
movaps %xmm0, 96(%esp)
movaps %xmm0, 224(%esp)
movaps %xmm0, 208(%esp)
movaps %xmm0, 192(%esp)
movaps %xmm0, 176(%esp)
movaps %xmm0, 160(%esp)
movaps %xmm0, 144(%esp)
movaps %xmm0, 128(%esp)
movaps %xmm0, 112(%esp)
movzbl 14(%esp), %eax
movd %eax, %xmm1
movzbl 22(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm1, %xmm2
movzbl 42(%esp), %eax
movd %eax, %xmm1
movzbl 50(%esp), %eax
movd %eax, %xmm3
punpcklbw %xmm1, %xmm3
punpcklbw %xmm2, %xmm3
movzbl 77(%esp), %eax
movd %eax, %xmm1
movzbl 84(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm1, %xmm2
movzbl 104(%esp), %eax
movd %eax, %xmm1
punpcklbw %xmm1, %xmm0
punpcklbw %xmm2, %xmm0
movaps %xmm0, %xmm1
punpcklbw %xmm3, %xmm1
movzbl 127(%esp), %eax
movd %eax, %xmm0
movzbl 135(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm0, %xmm2
movzbl 155(%esp), %eax
movd %eax, %xmm0
movzbl 163(%esp), %eax
movd %eax, %xmm3
punpcklbw %xmm0, %xmm3
punpcklbw %xmm2, %xmm3
movzbl 188(%esp), %eax
movd %eax, %xmm0
movzbl 197(%esp), %eax
movd %eax, %xmm2
punpcklbw %xmm0, %xmm2
movzbl 217(%esp), %eax
movd %eax, %xmm4
movzbl 225(%esp), %eax
movd %eax, %xmm0
punpcklbw %xmm4, %xmm0
punpcklbw %xmm2, %xmm0
punpcklbw %xmm3, %xmm0
punpcklbw %xmm1, %xmm0
addl $252, %esp
ret
llvm-svn: 65311
2009-02-23 09:49:38 +01:00
|
|
|
; RUN: grep pshufb %t | count 3
|
|
|
|
|
|
|
|
define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone {
|
|
|
|
entry:
|
|
|
|
%tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
|
|
|
|
ret <16 x i8> %tmp8
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
|
|
|
|
entry:
|
|
|
|
%tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> < i32 undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
|
|
|
|
ret <16 x i8> %tmp8
|
|
|
|
}
|