1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[X86] Teach combineCVTP2I_CVTTP2I to handle STRICT_CVTTP2SI/STRICT_CVTTP2UI

This allows us to shrink a simple 128-bit load so that it can be folded into
vcvttps2qq/vcvttps2uqq for v2f32->v2i64 conversions.
This commit is contained in:
Craig Topper 2020-06-07 18:28:31 -07:00
parent a619e90821
commit 6d167c498f
2 changed files with 622 additions and 74 deletions

View File

@ -44765,11 +44765,11 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
// FIXME: Handle strict fp nodes.
bool IsStrict = N->isTargetStrictFPOpcode();
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
SDValue In = N->getOperand(0);
SDValue In = N->getOperand(IsStrict ? 1 : 0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
@ -44780,9 +44780,16 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
if (IsStrict) {
SDValue Convert =
DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
{N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
DCI.CombineTo(N, Convert, Convert.getValue(1));
} else {
SDValue Convert =
DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
}
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
@ -47991,8 +47998,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
case X86ISD::STRICT_CVTTP2SI:
case X86ISD::CVTTP2SI:
case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::STRICT_CVTTP2UI:
case X86ISD::CVTTP2UI:
return combineCVTP2I_CVTTP2I(N, DAG, DCI);
case X86ISD::STRICT_CVTPH2PS:
case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);

View File

@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=CHECK,SSE-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=CHECK,SSE-64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,AVX512F,AVX512F-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,AVX512F,AVX512F-64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512dq,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VLDQ,AVX512VLDQ-32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VLDQ,AVX512VLDQ-64
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=CHECK,SSE-32
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=CHECK,SSE-64
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-32
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-64
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,AVX512F,AVX512F-32
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,AVX512F,AVX512F-64
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-32
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-64
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-32
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=avx512dq -O3 | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-64
; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=avx512dq,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VLDQ,AVX512VLDQ-32
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=avx512dq,avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512VLDQ,AVX512VLDQ-64
declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double>, metadata)
declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double>, metadata)
@ -703,6 +703,201 @@ define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 {
ret <2 x i64> %ret
}
; Strict (constrained, fpexcept.strict) fptosi of the low two floats of a full
; <4 x float> load to <2 x i64>. Only elements 0 and 1 of the loaded vector are
; used by the conversion; the AVX512VLDQ checks below show the load folded into
; the memory operand of vcvttps2qq.
define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64_load128(<4 x float>* %x) strictfp {
; SSE-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; SSE-32: # %bb.0:
; SSE-32-NEXT: pushl %ebp
; SSE-32-NEXT: .cfi_def_cfa_offset 8
; SSE-32-NEXT: .cfi_offset %ebp, -8
; SSE-32-NEXT: movl %esp, %ebp
; SSE-32-NEXT: .cfi_def_cfa_register %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
; SSE-32-NEXT: movl 8(%ebp), %eax
; SSE-32-NEXT: movaps (%eax), %xmm0
; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: flds {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00
; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: flds {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
; SSE-32-NEXT: fnstcw (%esp)
; SSE-32-NEXT: movzwl (%esp), %eax
; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00
; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw (%esp)
; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: .cfi_def_cfa %esp, 4
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; SSE-64: # %bb.0:
; SSE-64-NEXT: movaps (%rdi), %xmm1
; SSE-64-NEXT: cvttss2si %xmm1, %rax
; SSE-64-NEXT: movq %rax, %xmm0
; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-64-NEXT: cvttss2si %xmm1, %rax
; SSE-64-NEXT: movq %rax, %xmm1
; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX-32: # %bb.0:
; AVX-32-NEXT: pushl %ebp
; AVX-32-NEXT: .cfi_def_cfa_offset 8
; AVX-32-NEXT: .cfi_offset %ebp, -8
; AVX-32-NEXT: movl %esp, %ebp
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $16, %esp
; AVX-32-NEXT: movl 8(%ebp), %eax
; AVX-32-NEXT: vmovaps (%eax), %xmm0
; AVX-32-NEXT: vmovss %xmm0, (%esp)
; AVX-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds (%esp)
; AVX-32-NEXT: fisttpll (%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: movl %ebp, %esp
; AVX-32-NEXT: popl %ebp
; AVX-32-NEXT: .cfi_def_cfa %esp, 4
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vcvttss2si 4(%rdi), %rax
; AVX-64-NEXT: vmovq %rax, %xmm0
; AVX-64-NEXT: vcvttss2si (%rdi), %rax
; AVX-64-NEXT: vmovq %rax, %xmm1
; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-64-NEXT: retq
;
; AVX512F-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: pushl %ebp
; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: .cfi_offset %ebp, -8
; AVX512F-32-NEXT: movl %esp, %ebp
; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
; AVX512F-32-NEXT: andl $-8, %esp
; AVX512F-32-NEXT: subl $16, %esp
; AVX512F-32-NEXT: movl 8(%ebp), %eax
; AVX512F-32-NEXT: vmovdqa (%eax), %xmm0
; AVX512F-32-NEXT: vmovd %xmm0, (%esp)
; AVX512F-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: flds (%esp)
; AVX512F-32-NEXT: fisttpll (%esp)
; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: wait
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512F-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512F-32-NEXT: movl %ebp, %esp
; AVX512F-32-NEXT: popl %ebp
; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
; AVX512F-32-NEXT: retl
;
; AVX512F-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vcvttss2si 4(%rdi), %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm0
; AVX512F-64-NEXT: vcvttss2si (%rdi), %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm1
; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-64-NEXT: retq
;
; AVX512VL-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX512VL-32: # %bb.0:
; AVX512VL-32-NEXT: pushl %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8
; AVX512VL-32-NEXT: .cfi_offset %ebp, -8
; AVX512VL-32-NEXT: movl %esp, %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp
; AVX512VL-32-NEXT: andl $-8, %esp
; AVX512VL-32-NEXT: subl $16, %esp
; AVX512VL-32-NEXT: movl 8(%ebp), %eax
; AVX512VL-32-NEXT: vmovdqa (%eax), %xmm0
; AVX512VL-32-NEXT: vmovd %xmm0, (%esp)
; AVX512VL-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: flds (%esp)
; AVX512VL-32-NEXT: fisttpll (%esp)
; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: wait
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512VL-32-NEXT: movl %ebp, %esp
; AVX512VL-32-NEXT: popl %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4
; AVX512VL-32-NEXT: retl
;
; AVX512VL-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vcvttss2si 4(%rdi), %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm0
; AVX512VL-64-NEXT: vcvttss2si (%rdi), %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm1
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-64-NEXT: retq
;
; AVX512DQ-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512DQ-32-NEXT: vmovdqa (%eax), %xmm0
; AVX512DQ-32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-32-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-32-NEXT: vzeroupper
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-64-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-64-NEXT: vzeroupper
; AVX512DQ-64-NEXT: retq
;
; AVX512VLDQ-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX512VLDQ-32: # %bb.0:
; AVX512VLDQ-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VLDQ-32-NEXT: vcvttps2qq (%eax), %xmm0
; AVX512VLDQ-32-NEXT: retl
;
; AVX512VLDQ-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64_load128:
; AVX512VLDQ-64: # %bb.0:
; AVX512VLDQ-64-NEXT: vcvttps2qq (%rdi), %xmm0
; AVX512VLDQ-64-NEXT: retq
%a = load <4 x float>, <4 x float>* %x
%b = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%c = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(<2 x float> %b, metadata !"fpexcept.strict") #0
ret <2 x i64> %c
}
define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; SSE-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
; SSE-32: # %bb.0:
@ -717,10 +912,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; SSE-32-NEXT: comiss %xmm2, %xmm0
; SSE-32-NEXT: xorps %xmm1, %xmm1
; SSE-32-NEXT: xorps %xmm3, %xmm3
; SSE-32-NEXT: jb .LBB3_2
; SSE-32-NEXT: jb .LBB4_2
; SSE-32-NEXT: # %bb.1:
; SSE-32-NEXT: movaps %xmm2, %xmm3
; SSE-32-NEXT: .LBB3_2:
; SSE-32-NEXT: .LBB4_2:
; SSE-32-NEXT: movaps %xmm0, %xmm4
; SSE-32-NEXT: subss %xmm3, %xmm4
; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
@ -736,10 +931,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-32-NEXT: comiss %xmm2, %xmm0
; SSE-32-NEXT: jb .LBB3_4
; SSE-32-NEXT: jb .LBB4_4
; SSE-32-NEXT: # %bb.3:
; SSE-32-NEXT: movaps %xmm2, %xmm1
; SSE-32-NEXT: .LBB3_4:
; SSE-32-NEXT: .LBB4_4:
; SSE-32-NEXT: subss %xmm1, %xmm0
; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %cl
@ -776,10 +971,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; SSE-64-NEXT: comiss %xmm3, %xmm0
; SSE-64-NEXT: xorps %xmm2, %xmm2
; SSE-64-NEXT: xorps %xmm1, %xmm1
; SSE-64-NEXT: jb .LBB3_2
; SSE-64-NEXT: jb .LBB4_2
; SSE-64-NEXT: # %bb.1:
; SSE-64-NEXT: movaps %xmm3, %xmm1
; SSE-64-NEXT: .LBB3_2:
; SSE-64-NEXT: .LBB4_2:
; SSE-64-NEXT: movaps %xmm0, %xmm4
; SSE-64-NEXT: subss %xmm1, %xmm4
; SSE-64-NEXT: cvttss2si %xmm4, %rax
@ -790,10 +985,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; SSE-64-NEXT: movq %rcx, %xmm1
; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-64-NEXT: comiss %xmm3, %xmm0
; SSE-64-NEXT: jb .LBB3_4
; SSE-64-NEXT: jb .LBB4_4
; SSE-64-NEXT: # %bb.3:
; SSE-64-NEXT: movaps %xmm3, %xmm2
; SSE-64-NEXT: .LBB3_4:
; SSE-64-NEXT: .LBB4_4:
; SSE-64-NEXT: subss %xmm2, %xmm0
; SSE-64-NEXT: cvttss2si %xmm0, %rax
; SSE-64-NEXT: setae %cl
@ -819,10 +1014,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX-32-NEXT: vcomiss %xmm1, %xmm3
; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX-32-NEXT: jb .LBB3_2
; AVX-32-NEXT: jb .LBB4_2
; AVX-32-NEXT: # %bb.1:
; AVX-32-NEXT: vmovaps %xmm1, %xmm4
; AVX-32-NEXT: .LBB3_2:
; AVX-32-NEXT: .LBB4_2:
; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
@ -833,10 +1028,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: vcomiss %xmm1, %xmm0
; AVX-32-NEXT: jb .LBB3_4
; AVX-32-NEXT: jb .LBB4_4
; AVX-32-NEXT: # %bb.3:
; AVX-32-NEXT: vmovaps %xmm1, %xmm2
; AVX-32-NEXT: .LBB3_4:
; AVX-32-NEXT: .LBB4_4:
; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
; AVX-32-NEXT: vmovss %xmm0, (%esp)
; AVX-32-NEXT: flds (%esp)
@ -861,10 +1056,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX-64-NEXT: vcomiss %xmm1, %xmm0
; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-64-NEXT: jb .LBB3_2
; AVX-64-NEXT: jb .LBB4_2
; AVX-64-NEXT: # %bb.1:
; AVX-64-NEXT: vmovaps %xmm1, %xmm3
; AVX-64-NEXT: .LBB3_2:
; AVX-64-NEXT: .LBB4_2:
; AVX-64-NEXT: vsubss %xmm3, %xmm0, %xmm3
; AVX-64-NEXT: vcvttss2si %xmm3, %rax
; AVX-64-NEXT: setae %cl
@ -874,10 +1069,10 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
; AVX-64-NEXT: vmovq %rcx, %xmm3
; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-64-NEXT: vcomiss %xmm1, %xmm0
; AVX-64-NEXT: jb .LBB3_4
; AVX-64-NEXT: jb .LBB4_4
; AVX-64-NEXT: # %bb.3:
; AVX-64-NEXT: vmovaps %xmm1, %xmm2
; AVX-64-NEXT: .LBB3_4:
; AVX-64-NEXT: .LBB4_4:
; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vcvttss2si %xmm0, %rax
; AVX-64-NEXT: setae %cl
@ -1022,6 +1217,349 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
ret <2 x i64> %ret
}
; Strict (constrained, fpexcept.strict) fptoui of the low two floats of a full
; <4 x float> load to <2 x i64>. Only elements 0 and 1 of the loaded vector are
; used by the conversion; the AVX512VLDQ checks below show the load folded into
; the memory operand of vcvttps2uqq.
define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(<4 x float>* %x) strictfp {
; SSE-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; SSE-32: # %bb.0:
; SSE-32-NEXT: pushl %ebp
; SSE-32-NEXT: .cfi_def_cfa_offset 8
; SSE-32-NEXT: .cfi_offset %ebp, -8
; SSE-32-NEXT: movl %esp, %ebp
; SSE-32-NEXT: .cfi_def_cfa_register %ebp
; SSE-32-NEXT: andl $-8, %esp
; SSE-32-NEXT: subl $24, %esp
; SSE-32-NEXT: movl 8(%ebp), %eax
; SSE-32-NEXT: movaps (%eax), %xmm0
; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-32-NEXT: comiss %xmm2, %xmm0
; SSE-32-NEXT: xorps %xmm1, %xmm1
; SSE-32-NEXT: xorps %xmm3, %xmm3
; SSE-32-NEXT: jb .LBB5_2
; SSE-32-NEXT: # %bb.1:
; SSE-32-NEXT: movaps %xmm2, %xmm3
; SSE-32-NEXT: .LBB5_2:
; SSE-32-NEXT: movaps %xmm0, %xmm4
; SSE-32-NEXT: subss %xmm3, %xmm4
; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: flds {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00
; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-32-NEXT: comiss %xmm2, %xmm0
; SSE-32-NEXT: jb .LBB5_4
; SSE-32-NEXT: # %bb.3:
; SSE-32-NEXT: movaps %xmm2, %xmm1
; SSE-32-NEXT: .LBB5_4:
; SSE-32-NEXT: subss %xmm1, %xmm0
; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %cl
; SSE-32-NEXT: flds {{[0-9]+}}(%esp)
; SSE-32-NEXT: wait
; SSE-32-NEXT: fnstcw (%esp)
; SSE-32-NEXT: movzwl (%esp), %edx
; SSE-32-NEXT: orl $3072, %edx # imm = 0xC00
; SSE-32-NEXT: movw %dx, {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp)
; SSE-32-NEXT: fldcw (%esp)
; SSE-32-NEXT: movzbl %al, %eax
; SSE-32-NEXT: shll $31, %eax
; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; SSE-32-NEXT: movd %eax, %xmm1
; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-32-NEXT: movzbl %cl, %eax
; SSE-32-NEXT: shll $31, %eax
; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; SSE-32-NEXT: movd %eax, %xmm1
; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-32-NEXT: movl %ebp, %esp
; SSE-32-NEXT: popl %ebp
; SSE-32-NEXT: .cfi_def_cfa %esp, 4
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; SSE-64: # %bb.0:
; SSE-64-NEXT: movaps (%rdi), %xmm1
; SSE-64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE-64-NEXT: comiss %xmm3, %xmm1
; SSE-64-NEXT: xorps %xmm2, %xmm2
; SSE-64-NEXT: xorps %xmm0, %xmm0
; SSE-64-NEXT: jb .LBB5_2
; SSE-64-NEXT: # %bb.1:
; SSE-64-NEXT: movaps %xmm3, %xmm0
; SSE-64-NEXT: .LBB5_2:
; SSE-64-NEXT: movaps %xmm1, %xmm4
; SSE-64-NEXT: subss %xmm0, %xmm4
; SSE-64-NEXT: cvttss2si %xmm4, %rax
; SSE-64-NEXT: setae %cl
; SSE-64-NEXT: movzbl %cl, %ecx
; SSE-64-NEXT: shlq $63, %rcx
; SSE-64-NEXT: xorq %rax, %rcx
; SSE-64-NEXT: movq %rcx, %xmm0
; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-64-NEXT: comiss %xmm3, %xmm1
; SSE-64-NEXT: jb .LBB5_4
; SSE-64-NEXT: # %bb.3:
; SSE-64-NEXT: movaps %xmm3, %xmm2
; SSE-64-NEXT: .LBB5_4:
; SSE-64-NEXT: subss %xmm2, %xmm1
; SSE-64-NEXT: cvttss2si %xmm1, %rax
; SSE-64-NEXT: setae %cl
; SSE-64-NEXT: movzbl %cl, %ecx
; SSE-64-NEXT: shlq $63, %rcx
; SSE-64-NEXT: xorq %rax, %rcx
; SSE-64-NEXT: movq %rcx, %xmm1
; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX-32: # %bb.0:
; AVX-32-NEXT: pushl %ebp
; AVX-32-NEXT: .cfi_def_cfa_offset 8
; AVX-32-NEXT: .cfi_offset %ebp, -8
; AVX-32-NEXT: movl %esp, %ebp
; AVX-32-NEXT: .cfi_def_cfa_register %ebp
; AVX-32-NEXT: andl $-8, %esp
; AVX-32-NEXT: subl $16, %esp
; AVX-32-NEXT: movl 8(%ebp), %eax
; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-32-NEXT: vcomiss %xmm1, %xmm3
; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX-32-NEXT: jb .LBB5_2
; AVX-32-NEXT: # %bb.1:
; AVX-32-NEXT: vmovaps %xmm1, %xmm4
; AVX-32-NEXT: .LBB5_2:
; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: setae %al
; AVX-32-NEXT: movzbl %al, %eax
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: vcomiss %xmm1, %xmm0
; AVX-32-NEXT: jb .LBB5_4
; AVX-32-NEXT: # %bb.3:
; AVX-32-NEXT: vmovaps %xmm1, %xmm2
; AVX-32-NEXT: .LBB5_4:
; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
; AVX-32-NEXT: vmovss %xmm0, (%esp)
; AVX-32-NEXT: flds (%esp)
; AVX-32-NEXT: fisttpll (%esp)
; AVX-32-NEXT: wait
; AVX-32-NEXT: setae %cl
; AVX-32-NEXT: movzbl %cl, %ecx
; AVX-32-NEXT: shll $31, %ecx
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; AVX-32-NEXT: movl %ebp, %esp
; AVX-32-NEXT: popl %ebp
; AVX-32-NEXT: .cfi_def_cfa %esp, 4
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-64-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-64-NEXT: vcomiss %xmm1, %xmm3
; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-64-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX-64-NEXT: jb .LBB5_2
; AVX-64-NEXT: # %bb.1:
; AVX-64-NEXT: vmovaps %xmm1, %xmm4
; AVX-64-NEXT: .LBB5_2:
; AVX-64-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-64-NEXT: vcvttss2si %xmm3, %rax
; AVX-64-NEXT: setae %cl
; AVX-64-NEXT: movzbl %cl, %ecx
; AVX-64-NEXT: shlq $63, %rcx
; AVX-64-NEXT: xorq %rax, %rcx
; AVX-64-NEXT: vmovq %rcx, %xmm3
; AVX-64-NEXT: vcomiss %xmm1, %xmm0
; AVX-64-NEXT: jb .LBB5_4
; AVX-64-NEXT: # %bb.3:
; AVX-64-NEXT: vmovaps %xmm1, %xmm2
; AVX-64-NEXT: .LBB5_4:
; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vcvttss2si %xmm0, %rax
; AVX-64-NEXT: setae %cl
; AVX-64-NEXT: movzbl %cl, %ecx
; AVX-64-NEXT: shlq $63, %rcx
; AVX-64-NEXT: xorq %rax, %rcx
; AVX-64-NEXT: vmovq %rcx, %xmm0
; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-64-NEXT: retq
;
; AVX512F-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: pushl %ebp
; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: .cfi_offset %ebp, -8
; AVX512F-32-NEXT: movl %esp, %ebp
; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
; AVX512F-32-NEXT: andl $-8, %esp
; AVX512F-32-NEXT: subl $16, %esp
; AVX512F-32-NEXT: movl 8(%ebp), %eax
; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: xorl %eax, %eax
; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1
; AVX512F-32-NEXT: setb %cl
; AVX512F-32-NEXT: kmovw %ecx, %k1
; AVX512F-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX512F-32-NEXT: vmovaps %xmm2, %xmm4
; AVX512F-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
; AVX512F-32-NEXT: vsubss %xmm4, %xmm1, %xmm1
; AVX512F-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: wait
; AVX512F-32-NEXT: setae %al
; AVX512F-32-NEXT: shll $31, %eax
; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: xorl %ecx, %ecx
; AVX512F-32-NEXT: vcomiss %xmm2, %xmm0
; AVX512F-32-NEXT: setb %dl
; AVX512F-32-NEXT: kmovw %edx, %k1
; AVX512F-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
; AVX512F-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
; AVX512F-32-NEXT: vmovss %xmm0, (%esp)
; AVX512F-32-NEXT: flds (%esp)
; AVX512F-32-NEXT: fisttpll (%esp)
; AVX512F-32-NEXT: wait
; AVX512F-32-NEXT: setae %cl
; AVX512F-32-NEXT: shll $31, %ecx
; AVX512F-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX512F-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; AVX512F-32-NEXT: movl %ebp, %esp
; AVX512F-32-NEXT: popl %ebp
; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
; AVX512F-32-NEXT: retl
;
; AVX512F-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vcvttss2usi 4(%rdi), %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm0
; AVX512F-64-NEXT: vcvttss2usi (%rdi), %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm1
; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-64-NEXT: retq
;
; AVX512VL-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX512VL-32: # %bb.0:
; AVX512VL-32-NEXT: pushl %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8
; AVX512VL-32-NEXT: .cfi_offset %ebp, -8
; AVX512VL-32-NEXT: movl %esp, %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp
; AVX512VL-32-NEXT: andl $-8, %esp
; AVX512VL-32-NEXT: subl $16, %esp
; AVX512VL-32-NEXT: movl 8(%ebp), %eax
; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: xorl %eax, %eax
; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1
; AVX512VL-32-NEXT: setb %cl
; AVX512VL-32-NEXT: kmovw %ecx, %k1
; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4
; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm1
; AVX512VL-32-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: wait
; AVX512VL-32-NEXT: setae %al
; AVX512VL-32-NEXT: shll $31, %eax
; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX512VL-32-NEXT: xorl %ecx, %ecx
; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm0
; AVX512VL-32-NEXT: setb %dl
; AVX512VL-32-NEXT: kmovw %edx, %k1
; AVX512VL-32-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
; AVX512VL-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
; AVX512VL-32-NEXT: vmovss %xmm0, (%esp)
; AVX512VL-32-NEXT: flds (%esp)
; AVX512VL-32-NEXT: fisttpll (%esp)
; AVX512VL-32-NEXT: wait
; AVX512VL-32-NEXT: setae %cl
; AVX512VL-32-NEXT: shll $31, %ecx
; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; AVX512VL-32-NEXT: movl %ebp, %esp
; AVX512VL-32-NEXT: popl %ebp
; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4
; AVX512VL-32-NEXT: retl
;
; AVX512VL-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vcvttss2usi 4(%rdi), %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm0
; AVX512VL-64-NEXT: vcvttss2usi (%rdi), %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm1
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-64-NEXT: retq
;
; AVX512DQ-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX512DQ-32: # %bb.0:
; AVX512DQ-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512DQ-32-NEXT: vmovdqa (%eax), %xmm0
; AVX512DQ-32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-32-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-32-NEXT: vzeroupper
; AVX512DQ-32-NEXT: retl
;
; AVX512DQ-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX512DQ-64: # %bb.0:
; AVX512DQ-64-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-64-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX512DQ-64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-64-NEXT: vzeroupper
; AVX512DQ-64-NEXT: retq
;
; AVX512VLDQ-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX512VLDQ-32: # %bb.0:
; AVX512VLDQ-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VLDQ-32-NEXT: vcvttps2uqq (%eax), %xmm0
; AVX512VLDQ-32-NEXT: retl
;
; AVX512VLDQ-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
; AVX512VLDQ-64: # %bb.0:
; AVX512VLDQ-64-NEXT: vcvttps2uqq (%rdi), %xmm0
; AVX512VLDQ-64-NEXT: retq
%a = load <4 x float>, <4 x float>* %x
%b = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%c = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(<2 x float> %b, metadata !"fpexcept.strict") #0
ret <2 x i64> %c
}
define <2 x i32> @strict_vector_fptosi_v2f64_to_v2i32(<2 x double> %a) #0 {
; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i32:
; SSE-32: # %bb.0:
@ -1069,10 +1607,10 @@ define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 {
; SSE-32-NEXT: comisd %xmm3, %xmm0
; SSE-32-NEXT: xorpd %xmm2, %xmm2
; SSE-32-NEXT: xorpd %xmm1, %xmm1
; SSE-32-NEXT: jb .LBB5_2
; SSE-32-NEXT: jb .LBB7_2
; SSE-32-NEXT: # %bb.1:
; SSE-32-NEXT: movapd %xmm3, %xmm1
; SSE-32-NEXT: .LBB5_2:
; SSE-32-NEXT: .LBB7_2:
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: movzbl %al, %eax
; SSE-32-NEXT: shll $31, %eax
@ -1083,10 +1621,10 @@ define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 {
; SSE-32-NEXT: movd %ecx, %xmm1
; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-32-NEXT: comisd %xmm3, %xmm0
; SSE-32-NEXT: jb .LBB5_4
; SSE-32-NEXT: jb .LBB7_4
; SSE-32-NEXT: # %bb.3:
; SSE-32-NEXT: movapd %xmm3, %xmm2
; SSE-32-NEXT: .LBB5_4:
; SSE-32-NEXT: .LBB7_4:
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: movzbl %al, %eax
; SSE-32-NEXT: shll $31, %eax
@ -1225,10 +1763,10 @@ define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 {
; SSE-32-NEXT: comiss %xmm3, %xmm0
; SSE-32-NEXT: xorps %xmm2, %xmm2
; SSE-32-NEXT: xorps %xmm1, %xmm1
; SSE-32-NEXT: jb .LBB7_2
; SSE-32-NEXT: jb .LBB9_2
; SSE-32-NEXT: # %bb.1:
; SSE-32-NEXT: movaps %xmm3, %xmm1
; SSE-32-NEXT: .LBB7_2:
; SSE-32-NEXT: .LBB9_2:
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: movzbl %al, %eax
; SSE-32-NEXT: shll $31, %eax
@ -1239,10 +1777,10 @@ define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 {
; SSE-32-NEXT: movd %ecx, %xmm1
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-32-NEXT: comiss %xmm3, %xmm0
; SSE-32-NEXT: jb .LBB7_4
; SSE-32-NEXT: jb .LBB9_4
; SSE-32-NEXT: # %bb.3:
; SSE-32-NEXT: movaps %xmm3, %xmm2
; SSE-32-NEXT: .LBB7_4:
; SSE-32-NEXT: .LBB9_4:
; SSE-32-NEXT: setae %al
; SSE-32-NEXT: movzbl %al, %eax
; SSE-32-NEXT: shll $31, %eax
@ -1888,10 +2426,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; SSE-32-NEXT: comisd %xmm2, %xmm0
; SSE-32-NEXT: xorpd %xmm1, %xmm1
; SSE-32-NEXT: xorpd %xmm3, %xmm3
; SSE-32-NEXT: jb .LBB17_2
; SSE-32-NEXT: jb .LBB19_2
; SSE-32-NEXT: # %bb.1:
; SSE-32-NEXT: movapd %xmm2, %xmm3
; SSE-32-NEXT: .LBB17_2:
; SSE-32-NEXT: .LBB19_2:
; SSE-32-NEXT: movapd %xmm0, %xmm4
; SSE-32-NEXT: subsd %xmm3, %xmm4
; SSE-32-NEXT: movsd %xmm4, {{[0-9]+}}(%esp)
@ -1907,10 +2445,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-32-NEXT: comisd %xmm2, %xmm0
; SSE-32-NEXT: jb .LBB17_4
; SSE-32-NEXT: jb .LBB19_4
; SSE-32-NEXT: # %bb.3:
; SSE-32-NEXT: movapd %xmm2, %xmm1
; SSE-32-NEXT: .LBB17_4:
; SSE-32-NEXT: .LBB19_4:
; SSE-32-NEXT: subsd %xmm1, %xmm0
; SSE-32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %cl
@ -1947,10 +2485,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; SSE-64-NEXT: comisd %xmm3, %xmm0
; SSE-64-NEXT: xorpd %xmm2, %xmm2
; SSE-64-NEXT: xorpd %xmm1, %xmm1
; SSE-64-NEXT: jb .LBB17_2
; SSE-64-NEXT: jb .LBB19_2
; SSE-64-NEXT: # %bb.1:
; SSE-64-NEXT: movapd %xmm3, %xmm1
; SSE-64-NEXT: .LBB17_2:
; SSE-64-NEXT: .LBB19_2:
; SSE-64-NEXT: movapd %xmm0, %xmm4
; SSE-64-NEXT: subsd %xmm1, %xmm4
; SSE-64-NEXT: cvttsd2si %xmm4, %rax
@ -1961,10 +2499,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; SSE-64-NEXT: movq %rcx, %xmm1
; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-64-NEXT: comisd %xmm3, %xmm0
; SSE-64-NEXT: jb .LBB17_4
; SSE-64-NEXT: jb .LBB19_4
; SSE-64-NEXT: # %bb.3:
; SSE-64-NEXT: movapd %xmm3, %xmm2
; SSE-64-NEXT: .LBB17_4:
; SSE-64-NEXT: .LBB19_4:
; SSE-64-NEXT: subsd %xmm2, %xmm0
; SSE-64-NEXT: cvttsd2si %xmm0, %rax
; SSE-64-NEXT: setae %cl
@ -1990,10 +2528,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; AVX-32-NEXT: vcomisd %xmm1, %xmm3
; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; AVX-32-NEXT: jb .LBB17_2
; AVX-32-NEXT: jb .LBB19_2
; AVX-32-NEXT: # %bb.1:
; AVX-32-NEXT: vmovapd %xmm1, %xmm4
; AVX-32-NEXT: .LBB17_2:
; AVX-32-NEXT: .LBB19_2:
; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
; AVX-32-NEXT: vmovsd %xmm3, (%esp)
; AVX-32-NEXT: fldl (%esp)
@ -2004,10 +2542,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: vcomisd %xmm1, %xmm0
; AVX-32-NEXT: jb .LBB17_4
; AVX-32-NEXT: jb .LBB19_4
; AVX-32-NEXT: # %bb.3:
; AVX-32-NEXT: vmovapd %xmm1, %xmm2
; AVX-32-NEXT: .LBB17_4:
; AVX-32-NEXT: .LBB19_4:
; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0
; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp)
; AVX-32-NEXT: fldl {{[0-9]+}}(%esp)
@ -2032,10 +2570,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; AVX-64-NEXT: vcomisd %xmm1, %xmm0
; AVX-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX-64-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; AVX-64-NEXT: jb .LBB17_2
; AVX-64-NEXT: jb .LBB19_2
; AVX-64-NEXT: # %bb.1:
; AVX-64-NEXT: vmovapd %xmm1, %xmm3
; AVX-64-NEXT: .LBB17_2:
; AVX-64-NEXT: .LBB19_2:
; AVX-64-NEXT: vsubsd %xmm3, %xmm0, %xmm3
; AVX-64-NEXT: vcvttsd2si %xmm3, %rax
; AVX-64-NEXT: setae %cl
@ -2045,10 +2583,10 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
; AVX-64-NEXT: vmovq %rcx, %xmm3
; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-64-NEXT: vcomisd %xmm1, %xmm0
; AVX-64-NEXT: jb .LBB17_4
; AVX-64-NEXT: jb .LBB19_4
; AVX-64-NEXT: # %bb.3:
; AVX-64-NEXT: vmovapd %xmm1, %xmm2
; AVX-64-NEXT: .LBB17_4:
; AVX-64-NEXT: .LBB19_4:
; AVX-64-NEXT: vsubsd %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vcvttsd2si %xmm0, %rax
; AVX-64-NEXT: setae %cl
@ -2264,10 +2802,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; SSE-32-NEXT: comiss %xmm2, %xmm0
; SSE-32-NEXT: xorps %xmm1, %xmm1
; SSE-32-NEXT: xorps %xmm3, %xmm3
; SSE-32-NEXT: jb .LBB19_2
; SSE-32-NEXT: jb .LBB21_2
; SSE-32-NEXT: # %bb.1:
; SSE-32-NEXT: movaps %xmm2, %xmm3
; SSE-32-NEXT: .LBB19_2:
; SSE-32-NEXT: .LBB21_2:
; SSE-32-NEXT: movaps %xmm0, %xmm4
; SSE-32-NEXT: subss %xmm3, %xmm4
; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
@ -2283,10 +2821,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp)
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-32-NEXT: comiss %xmm2, %xmm0
; SSE-32-NEXT: jb .LBB19_4
; SSE-32-NEXT: jb .LBB21_4
; SSE-32-NEXT: # %bb.3:
; SSE-32-NEXT: movaps %xmm2, %xmm1
; SSE-32-NEXT: .LBB19_4:
; SSE-32-NEXT: .LBB21_4:
; SSE-32-NEXT: subss %xmm1, %xmm0
; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; SSE-32-NEXT: setae %cl
@ -2323,10 +2861,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; SSE-64-NEXT: comiss %xmm3, %xmm0
; SSE-64-NEXT: xorps %xmm2, %xmm2
; SSE-64-NEXT: xorps %xmm1, %xmm1
; SSE-64-NEXT: jb .LBB19_2
; SSE-64-NEXT: jb .LBB21_2
; SSE-64-NEXT: # %bb.1:
; SSE-64-NEXT: movaps %xmm3, %xmm1
; SSE-64-NEXT: .LBB19_2:
; SSE-64-NEXT: .LBB21_2:
; SSE-64-NEXT: movaps %xmm0, %xmm4
; SSE-64-NEXT: subss %xmm1, %xmm4
; SSE-64-NEXT: cvttss2si %xmm4, %rax
@ -2337,10 +2875,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; SSE-64-NEXT: movq %rcx, %xmm1
; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-64-NEXT: comiss %xmm3, %xmm0
; SSE-64-NEXT: jb .LBB19_4
; SSE-64-NEXT: jb .LBB21_4
; SSE-64-NEXT: # %bb.3:
; SSE-64-NEXT: movaps %xmm3, %xmm2
; SSE-64-NEXT: .LBB19_4:
; SSE-64-NEXT: .LBB21_4:
; SSE-64-NEXT: subss %xmm2, %xmm0
; SSE-64-NEXT: cvttss2si %xmm0, %rax
; SSE-64-NEXT: setae %cl
@ -2366,10 +2904,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; AVX-32-NEXT: vcomiss %xmm1, %xmm3
; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX-32-NEXT: jb .LBB19_2
; AVX-32-NEXT: jb .LBB21_2
; AVX-32-NEXT: # %bb.1:
; AVX-32-NEXT: vmovaps %xmm1, %xmm4
; AVX-32-NEXT: .LBB19_2:
; AVX-32-NEXT: .LBB21_2:
; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp)
; AVX-32-NEXT: flds {{[0-9]+}}(%esp)
@ -2380,10 +2918,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; AVX-32-NEXT: shll $31, %eax
; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: vcomiss %xmm1, %xmm0
; AVX-32-NEXT: jb .LBB19_4
; AVX-32-NEXT: jb .LBB21_4
; AVX-32-NEXT: # %bb.3:
; AVX-32-NEXT: vmovaps %xmm1, %xmm2
; AVX-32-NEXT: .LBB19_4:
; AVX-32-NEXT: .LBB21_4:
; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0
; AVX-32-NEXT: vmovss %xmm0, (%esp)
; AVX-32-NEXT: flds (%esp)
@ -2408,10 +2946,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; AVX-64-NEXT: vcomiss %xmm1, %xmm0
; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX-64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX-64-NEXT: jb .LBB19_2
; AVX-64-NEXT: jb .LBB21_2
; AVX-64-NEXT: # %bb.1:
; AVX-64-NEXT: vmovaps %xmm1, %xmm3
; AVX-64-NEXT: .LBB19_2:
; AVX-64-NEXT: .LBB21_2:
; AVX-64-NEXT: vsubss %xmm3, %xmm0, %xmm3
; AVX-64-NEXT: vcvttss2si %xmm3, %rax
; AVX-64-NEXT: setae %cl
@ -2421,10 +2959,10 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
; AVX-64-NEXT: vmovq %rcx, %xmm3
; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-64-NEXT: vcomiss %xmm1, %xmm0
; AVX-64-NEXT: jb .LBB19_4
; AVX-64-NEXT: jb .LBB21_4
; AVX-64-NEXT: # %bb.3:
; AVX-64-NEXT: vmovaps %xmm1, %xmm2
; AVX-64-NEXT: .LBB19_4:
; AVX-64-NEXT: .LBB21_4:
; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm0
; AVX-64-NEXT: vcvttss2si %xmm0, %rax
; AVX-64-NEXT: setae %cl