llvm-mirror/test/CodeGen/X86/promote.ll
Simon Pilgrim (966ca434a4): [X86][SSE] Aggressively use PMADDWD for v4i32 multiplies with 17 or more leading zeros
As discussed in D41484, PMADDWD for 'zero extended' vXi32 is nearly always a better option than PMULLD:
On SNB it will result in code that isn't any faster, but not any slower so we may as well keep it.
On KNL it only has half the throughput, so I've disabled it on there - ideally there'd be a better way than this.

Differential Revision: https://reviews.llvm.org/D42258

llvm-svn: 323367
2018-01-24 19:20:02 +00:00
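
The pattern the commit targets, reduced to a minimal standalone sketch (hypothetical function name, not part of the test below): when both operands of a v4i32 multiply are zero-extended from i8, every lane has at least 24 leading zeros, comfortably over the 17 required, so a signed 16-bit multiply (PMADDWD) computes the same product as PMULLD.

; Minimal sketch, assuming an SSE4.1-class target such as -mcpu=corei7.
; Each zext guarantees 24 leading zeros per i32 lane, so the values fit in
; the positive range of i16 and pmaddwd matches the pmulld result exactly.
define <4 x i32> @pmaddwd_sketch(<4 x i8> %a, <4 x i8> %b) {
  %za = zext <4 x i8> %a to <4 x i32>
  %zb = zext <4 x i8> %b to <4 x i32>
  %m = mul <4 x i32> %za, %zb
  ret <4 x i32> %m
}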

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mcpu=corei7 | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s --check-prefixes=CHECK,X64
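
; mul_f: multiply of a v4i8 value by itself. The load is zero-extended to
; v4i32 (pmovzxbd), so each i32 lane has at least 24 leading zeros and the
; multiply is expected to lower to pmaddwd rather than pmulld.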
define i32 @mul_f(<4 x i8>* %A) {
; X86-LABEL: mul_f:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-NEXT:    pmaddwd %xmm0, %xmm0
; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT:    movd %xmm0, (%eax)
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: mul_f:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-NEXT:    pmaddwd %xmm0, %xmm0
; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT:    movd %xmm0, (%rax)
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    retq
entry:
  %0 = load <4 x i8>, <4 x i8>* %A, align 8
  %mul = mul <4 x i8> %0, %0
  store <4 x i8> %mul, <4 x i8>* undef
  ret i32 0
}
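
; shuff_f: the same zero-extended v4i8 pattern with an add instead of a
; multiply; the add is promoted to a full-width paddd and the result is
; truncated back to bytes via pshufb.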
define i32 @shuff_f(<4 x i8>* %A) {
; X86-LABEL: shuff_f:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-NEXT:    paddd %xmm0, %xmm0
; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT:    movd %xmm0, (%eax)
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: shuff_f:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-NEXT:    paddd %xmm0, %xmm0
; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT:    movd %xmm0, (%rax)
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    retq
entry:
  %0 = load <4 x i8>, <4 x i8>* %A, align 8
  %add = add <4 x i8> %0, %0
  store <4 x i8> %add, <4 x i8>* undef
  ret i32 0
}
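
; bitcast_widen: extracting the low two lanes of a v4i32 and bitcasting them
; to <2 x float> should fold away entirely; both targets emit only a return.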
define <2 x float> @bitcast_widen(<4 x i32> %in) nounwind readnone {
; X86-LABEL: bitcast_widen:
; X86:       # %bb.0: # %entry
; X86-NEXT:    retl
;
; X64-LABEL: bitcast_widen:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %x = shufflevector <4 x i32> %in, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %y = bitcast <2 x i32> %x to <2 x float>
  ret <2 x float> %y
}