1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 19:23:23 +01:00
llvm-mirror/test/CodeGen/X86/pr18344.ll
Simon Pilgrim 1d50b0c236 [X86][SSE] Don't coalesce v4i32 extracts
We currently coalesce v4i32 extracts from all 4 elements to 2 v2i64 extracts + shifts/sign-extends.

This seems to have been added back in the days when we tended to spill vectors and reload scalars, or ended up with repeated shuffles moving everything down to the 0th index. I don't think either of these is likely these days, as we have better EXTRACT_VECTOR_ELT and VECTOR_SHUFFLE handling, and the existing code tends to make things very difficult for various vector and load combines.

Differential Revision: https://reviews.llvm.org/D42308

llvm-svn: 323541
2018-01-26 17:11:34 +00:00

92 lines
4.1 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
; Aggregate of two <4 x float> vectors. @FFT below indexes element 4 of an
; array of these; the expected assembly places that element at byte offset 128.
%v4_varying_complex = type { <4 x float>, <4 x float> }
; @FFT: gather four floats from %re using indices taken from a <4 x i32>
; vector, then scatter them into %destination at fixed offsets.
;
;   %destination        - base pointer for the four scalar float stores
;                         (byte offsets 128, 164, 200 and 236).
;   %re                 - base pointer of the float data being gathered.
;   %ptr_cast_for_load  - address of the <4 x i32> index vector; the load is
;                         only 4-byte aligned, so an unaligned vector load
;                         (movdqu) is expected.
;
; What the autogenerated assertions pin down: the shift is done once on the
; whole vector (pslld $4), and every lane is then extracted on its own with
; movd / pextrd $1..$3. On the 64-bit target each extracted lane is followed
; by a sign-extension (movslq / cltq); on the 32-bit target no separate
; extension instruction is expected.
;
; NOTE(review): the file name suggests this is the regression test for
; PR18344 - confirm against the bug tracker before changing the IR.
; The assertion lines below are produced by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define void @FFT(%v4_varying_complex* noalias nocapture %destination, float* noalias %re, <4 x i32>* noalias nocapture %ptr_cast_for_load) nounwind {
; X86-LABEL: FFT:
; X86: # %bb.0: # %begin
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movdqu (%edx), %xmm0
; X86-NEXT: pslld $4, %xmm0
; X86-NEXT: movd %xmm0, %edx
; X86-NEXT: pextrd $1, %xmm0, %esi
; X86-NEXT: pextrd $2, %xmm0, %edi
; X86-NEXT: pextrd $3, %xmm0, %ebx
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT: movss %xmm0, 128(%eax)
; X86-NEXT: movss %xmm1, 164(%eax)
; X86-NEXT: movss %xmm2, 200(%eax)
; X86-NEXT: movss %xmm3, 236(%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: FFT:
; X64: # %bb.0: # %begin
; X64-NEXT: movdqu (%rdx), %xmm0
; X64-NEXT: pslld $4, %xmm0
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: movslq %eax, %r8
; X64-NEXT: pextrd $1, %xmm0, %ecx
; X64-NEXT: movslq %ecx, %rcx
; X64-NEXT: pextrd $2, %xmm0, %edx
; X64-NEXT: movslq %edx, %rdx
; X64-NEXT: pextrd $3, %xmm0, %eax
; X64-NEXT: cltq
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X64-NEXT: movss %xmm0, 128(%rdi)
; X64-NEXT: movss %xmm1, 164(%rdi)
; X64-NEXT: movss %xmm2, 200(%rdi)
; X64-NEXT: movss %xmm3, 236(%rdi)
; X64-NEXT: retq
begin:
; Load the four i32 indices (4-byte alignment only) and shift every lane left
; by 4 in a single vector operation.
%ptr_masked_load79 = load <4 x i32>, <4 x i32>* %ptr_cast_for_load, align 4
%mul__bitReversedProgramIndex_load = shl <4 x i32> %ptr_masked_load79, <i32 4, i32 4, i32 4, i32 4>
; Extract all four lanes as scalars and sign-extend each one to i64. This
; extract-every-lane pattern is what the expected assembly is checking.
%offset32_1 = extractelement <4 x i32> %mul__bitReversedProgramIndex_load, i32 0
%ptroffset_1 = sext i32 %offset32_1 to i64
%offset32_2 = extractelement <4 x i32> %mul__bitReversedProgramIndex_load, i32 1
%ptroffset_2 = sext i32 %offset32_2 to i64
%offset32_3 = extractelement <4 x i32> %mul__bitReversedProgramIndex_load, i32 2
%ptroffset_3 = sext i32 %offset32_3 to i64
%offset32_4 = extractelement <4 x i32> %mul__bitReversedProgramIndex_load, i32 3
%ptroffset_4 = sext i32 %offset32_4 to i64
; Gather: use each sign-extended value as a float-element index off %re and
; load one float per lane.
%ptrcast_1 = getelementptr float, float* %re, i64 %ptroffset_1
%val_1 = load float, float* %ptrcast_1, align 4
%ptrcast_2 = getelementptr float, float* %re, i64 %ptroffset_2
%val_2 = load float, float* %ptrcast_2, align 4
%ptrcast_3 = getelementptr float, float* %re, i64 %ptroffset_3
%val_3 = load float, float* %ptrcast_3, align 4
%ptrcast_4 = getelementptr float, float* %re, i64 %ptroffset_4
%val_4 = load float, float* %ptrcast_4, align 4
; Scatter: the first store addresses array element 4, field 0, lane 0 through
; a typed GEP (byte offset 128 in the expected assembly); the other three use
; raw byte offsets 164, 200 and 236 from the i8* view of %destination.
%destination_load_ptr2int_2void = bitcast %v4_varying_complex* %destination to i8*
%ptrcast1_1 = getelementptr inbounds %v4_varying_complex, %v4_varying_complex* %destination, i64 4, i32 0, i64 0
store float %val_1, float* %ptrcast1_1, align 4
%finalptr_2 = getelementptr i8, i8* %destination_load_ptr2int_2void, i64 164
%ptrcast1_2 = bitcast i8* %finalptr_2 to float*
store float %val_2, float* %ptrcast1_2, align 4
%finalptr_3 = getelementptr i8, i8* %destination_load_ptr2int_2void, i64 200
%ptrcast1_3 = bitcast i8* %finalptr_3 to float*
store float %val_3, float* %ptrcast1_3, align 4
%finalptr_4 = getelementptr i8, i8* %destination_load_ptr2int_2void, i64 236
%ptrcast1_4 = bitcast i8* %finalptr_4 to float*
store float %val_4, float* %ptrcast1_4, align 4
ret void
}