; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
; sitofp <16 x i32> -> <16 x float> lowers to a single 512-bit vcvtdq2ps on every subtarget.
define <16 x float> @sitof32(<16 x i32> %a) nounwind {
; ALL-LABEL: sitof32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%b = sitofp <16 x i32> %a to <16 x float>
ret <16 x float> %b
}
; sitofp <8 x i64> -> <8 x double>. Without AVX512DQ each element is scalarized
; (vpextrq + vcvtsi2sdq, then reassembled with vmovlhps/vinsertf128); DQ-capable
; subtargets use a single vcvtqq2pd.
define <8 x double> @sltof864(<8 x i64> %a) {
; NODQ-LABEL: sltof864:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: sltof864:
; DQ: # %bb.0:
; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; DQ-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
; sitofp <4 x i64> -> <4 x double>. VL+DQ emits a 256-bit vcvtqq2pd; DQ without VL
; widens to zmm and narrows back (the "kill" comments mark implicit reg widening);
; no DQ means full scalarization.
define <4 x double> @slto4f64(<4 x i64> %a) {
; NODQ-LABEL: slto4f64:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NODQ-NEXT: retq
;
; VLDQ-LABEL: slto4f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: slto4f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x double>
ret <4 x double> %b
}
; sitofp <2 x i64> -> <2 x double>. VL+DQ gets a 128-bit vcvtqq2pd; DQ-only widens
; to zmm (hence the vzeroupper before returning); otherwise two scalar converts.
define <2 x double> @slto2f64(<2 x i64> %a) {
; NODQ-LABEL: slto2f64:
; NODQ: # %bb.0:
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; NODQ-NEXT: retq
;
; VLDQ-LABEL: slto2f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: slto2f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x double>
ret <2 x double> %b
}
; sitofp <2 x i64> -> <2 x float>. The scalarized path converts a third (duplicated)
; element because the <2 x float> is widened to <4 x float> during legalization.
define <2 x float> @sltof2f32(<2 x i64> %a) {
; NODQ-LABEL: sltof2f32:
; NODQ: # %bb.0:
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; NODQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; NODQ-NEXT: retq
;
; VLDQ-LABEL: sltof2f32:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: sltof2f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x float>
ret <2 x float>%b
}
; sitofp of a <4 x i64> loaded from memory. VL+DQ folds the load into the
; memory form vcvtqq2psy; other subtargets load first and then convert.
define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
; NODQ-LABEL: slto4f32_mem:
; NODQ: # %bb.0:
; NODQ-NEXT: vmovdqu (%rdi), %ymm0
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NODQ-NEXT: vzeroupper
; NODQ-NEXT: retq
;
; VLDQ-LABEL: slto4f32_mem:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: slto4f32_mem:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%a1 = load <4 x i64>, <4 x i64>* %a, align 8
%b = sitofp <4 x i64> %a1 to <4 x float>
ret <4 x float>%b
}
; fptosi <4 x double> -> <4 x i64>. Needs the DQ extension for vcvttpd2qq;
; without it each lane goes through scalar vcvttsd2si.
define <4 x i64> @f64to4sl(<4 x double> %a) {
; NODQ-LABEL: f64to4sl:
; NODQ: # %bb.0:
; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1
; NODQ-NEXT: vcvttsd2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; NODQ-NEXT: vcvttsd2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm1
; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; NODQ-NEXT: vcvttsd2si %xmm0, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; NODQ-NEXT: vcvttsd2si %xmm0, %rax
; NODQ-NEXT: vmovq %rax, %xmm0
; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; NODQ-NEXT: retq
;
; VLDQ-LABEL: f64to4sl:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: f64to4sl:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i64>
ret <4 x i64> %b
}
; fptosi <4 x float> -> <4 x i64>. DQ subtargets use vcvttps2qq (widening
; xmm -> ymm/zmm as needed); without DQ each lane is a scalar vcvttss2si.
define <4 x i64> @f32to4sl(<4 x float> %a) {
; NODQ-LABEL: f32to4sl:
; NODQ: # %bb.0:
; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; NODQ-NEXT: vcvttss2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm1
; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; NODQ-NEXT: vcvttss2si %xmm2, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; NODQ-NEXT: vcvttss2si %xmm0, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
; NODQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; NODQ-NEXT: vcvttss2si %xmm0, %rax
; NODQ-NEXT: vmovq %rax, %xmm0
; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; NODQ-NEXT: retq
;
; VLDQ-LABEL: f32to4sl:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: f32to4sl:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT: retq
%b = fptosi <4 x float> %a to <4 x i64>
ret <4 x i64> %b
}
; sitofp <4 x i64> -> <4 x float> with a register source (memory-source variant
; is slto4f32_mem above). VL+DQ narrows ymm -> xmm with one vcvtqq2ps.
define <4 x float> @slto4f32(<4 x i64> %a) {
; NODQ-LABEL: slto4f32:
; NODQ: # %bb.0:
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NODQ-NEXT: vzeroupper
; NODQ-NEXT: retq
;
; VLDQ-LABEL: slto4f32:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: slto4f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
; uitofp <4 x i64> -> <4 x float>. Unsigned variant of slto4f32: the scalar
; fallback uses vcvtusi2ssq, the DQ paths use vcvtuqq2ps.
define <4 x float> @ulto4f32(<4 x i64> %a) {
; NODQ-LABEL: ulto4f32:
; NODQ: # %bb.0:
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; NODQ-NEXT: vzeroupper
; NODQ-NEXT: retq
;
; VLDQ-LABEL: ulto4f32:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: ulto4f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
; uitofp <8 x i64> -> <8 x double>. Scalarized via vcvtusi2sdq without DQ;
; a single vcvtuqq2pd with DQ.
define <8 x double> @ulto8f64(<8 x i64> %a) {
; NODQ-LABEL: ulto8f64:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0
; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: ulto8f64:
; DQ: # %bb.0:
; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; DQ-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
; uitofp <16 x i64> -> <16 x double>: the 16-element case splits into two zmm
; halves; with DQ that is just two vcvtuqq2pd, otherwise 16 scalar converts.
define <16 x double> @ulto16f64(<16 x i64> %a) {
; NODQ-LABEL: ulto16f64:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0
; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1
; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
; NODQ-NEXT: retq
;
; DQ-LABEL: ulto16f64:
; DQ: # %bb.0:
; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1
; DQ-NEXT: retq
%b = uitofp <16 x i64> %a to <16 x double>
ret <16 x double> %b
}
; fptosi <16 x float> -> <16 x i32> via one vcvttps2dq.
; NOTE(review): the name says "f64" but the operand is <16 x float> — name is
; historical/misleading; check lines are authoritative.
define <16 x i32> @f64to16si(<16 x float> %a) nounwind {
; ALL-LABEL: f64to16si:
; ALL: # %bb.0:
; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
; ALL-NEXT: retq
%b = fptosi <16 x float> %a to <16 x i32>
ret <16 x i32> %b
}
; fptosi <16 x float> -> <16 x i8>: convert to i32 then truncate with vpmovdb.
define <16 x i8> @f32to16sc(<16 x float> %f) {
; ALL-LABEL: f32to16sc:
; ALL: # %bb.0:
; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
; ALL-NEXT: vpmovdb %zmm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%res = fptosi <16 x float> %f to <16 x i8>
ret <16 x i8> %res
}
; fptosi <16 x float> -> <16 x i16>: convert to i32 then truncate with vpmovdw.
define <16 x i16> @f32to16ss(<16 x float> %f) {
; ALL-LABEL: f32to16ss:
; ALL: # %bb.0:
; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
; ALL-NEXT: vpmovdw %zmm0, %ymm0
; ALL-NEXT: retq
%res = fptosi <16 x float> %f to <16 x i16>
ret <16 x i16> %res
}
; fptoui <16 x float> -> <16 x i32> via the AVX-512 unsigned convert vcvttps2udq.
define <16 x i32> @f32to16ui(<16 x float> %a) nounwind {
; ALL-LABEL: f32to16ui:
; ALL: # %bb.0:
; ALL-NEXT: vcvttps2udq %zmm0, %zmm0
; ALL-NEXT: retq
%b = fptoui <16 x float> %a to <16 x i32>
ret <16 x i32> %b
}
; fptoui <16 x float> -> <16 x i8>: signed vcvttps2dq is sufficient here since
; the truncate to i8 discards the bits where signed/unsigned results differ.
define <16 x i8> @f32to16uc(<16 x float> %f) {
; ALL-LABEL: f32to16uc:
; ALL: # %bb.0:
; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
; ALL-NEXT: vpmovdb %zmm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%res = fptoui <16 x float> %f to <16 x i8>
ret <16 x i8> %res
}
; fptoui <16 x float> -> <16 x i16>: same signed-convert-then-truncate trick,
; narrowing with vpmovdw.
define <16 x i16> @f32to16us(<16 x float> %f) {
; ALL-LABEL: f32to16us:
; ALL: # %bb.0:
; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
; ALL-NEXT: vpmovdw %zmm0, %ymm0
; ALL-NEXT: retq
%res = fptoui <16 x float> %f to <16 x i16>
ret <16 x i16> %res
}
; fptoui <8 x float> -> <8 x i32>. VL subtargets have the 256-bit vcvttps2udq;
; without VL the operand is widened to zmm.
define <8 x i32> @f32to8ui(<8 x float> %a) nounwind {
; NOVL-LABEL: f32to8ui:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; NOVL-NEXT: retq
;
; VL-LABEL: f32to8ui:
; VL: # %bb.0:
; VL-NEXT: vcvttps2udq %ymm0, %ymm0
; VL-NEXT: retq
%b = fptoui <8 x float> %a to <8 x i32>
ret <8 x i32> %b
}
; fptoui <4 x float> -> <4 x i32>. 128-bit vcvttps2udq needs VL; otherwise
; widen to zmm and emit vzeroupper on return.
define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
; NOVL-LABEL: f32to4ui:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: f32to4ui:
; VL: # %bb.0:
; VL-NEXT: vcvttps2udq %xmm0, %xmm0
; VL-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
ret <4 x i32> %b
}
; fptoui <8 x double> -> <8 x i32>: single narrowing vcvttpd2udq (zmm -> ymm).
define <8 x i32> @f64to8ui(<8 x double> %a) nounwind {
; ALL-LABEL: f64to8ui:
; ALL: # %bb.0:
; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0
; ALL-NEXT: retq
%b = fptoui <8 x double> %a to <8 x i32>
ret <8 x i32> %b
}
; fptoui <8 x double> -> <8 x i16>: signed convert to i32 then truncate; VL can
; use the 256-bit vpmovdw form, non-VL truncates through zmm.
define <8 x i16> @f64to8us(<8 x double> %f) {
; NOVL-LABEL: f64to8us:
; NOVL: # %bb.0:
; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
; NOVL-NEXT: vpmovdw %zmm0, %ymm0
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: f64to8us:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
; VL-NEXT: vpmovdw %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
%res = fptoui <8 x double> %f to <8 x i16>
ret <8 x i16> %res
}
; fptoui <8 x double> -> <8 x i8>: identical codegen to f64to8us because the
; <8 x i8> result is widened to <8 x i16> lanes during legalization.
define <8 x i8> @f64to8uc(<8 x double> %f) {
; NOVL-LABEL: f64to8uc:
; NOVL: # %bb.0:
; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
; NOVL-NEXT: vpmovdw %zmm0, %ymm0
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: f64to8uc:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
; VL-NEXT: vpmovdw %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
%res = fptoui <8 x double> %f to <8 x i8>
ret <8 x i8> %res
}
; fptoui <4 x double> -> <4 x i32>. VL has the narrowing ymm -> xmm
; vcvttpd2udq; non-VL widens to zmm first.
define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
; NOVL-LABEL: f64to4ui:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: f64to4ui:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
%b = fptoui <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
; sitofp <8 x i32> -> <8 x double>: single widening vcvtdq2pd (ymm -> zmm).
define <8 x double> @sito8f64(<8 x i32> %a) {
; ALL-LABEL: sito8f64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
%b = sitofp <8 x i32> %a to <8 x double>
ret <8 x double> %b
}
; Merge-masked sitofp: select(bitcast i8 mask, cvt(b), a) folds into a
; masked vcvtdq2pd. BW subtargets move the mask with kmovd, others with kmovw.
define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
; KNL-LABEL: i32to8f64_mask:
; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; VLBW-LABEL: i32to8f64_mask:
; VLBW: # %bb.0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; VLBW-NEXT: retq
;
; VLNOBW-LABEL: i32to8f64_mask:
; VLNOBW: # %bb.0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; VLNOBW-NEXT: retq
;
; AVX512DQ-LABEL: i32to8f64_mask:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: i32to8f64_mask:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%1 = bitcast i8 %c to <8 x i1>
%2 = sitofp <8 x i32> %b to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
ret <8 x double> %3
}
; Zero-masked sitofp: select against zeroinitializer folds into the {z} form
; of the masked vcvtdq2pd.
define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
; KNL-LABEL: sito8f64_maskz:
; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; VLBW-LABEL: sito8f64_maskz:
; VLBW: # %bb.0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; VLBW-NEXT: retq
;
; VLNOBW-LABEL: sito8f64_maskz:
; VLNOBW: # %bb.0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; VLNOBW-NEXT: retq
;
; AVX512DQ-LABEL: sito8f64_maskz:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: sito8f64_maskz:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
%1 = bitcast i8 %b to <8 x i1>
%2 = sitofp <8 x i32> %a to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
ret <8 x double> %3
}
; fptosi <8 x double> -> <8 x i32>: single narrowing vcvttpd2dq.
define <8 x i32> @f64to8si(<8 x double> %a) {
; ALL-LABEL: f64to8si:
; ALL: # %bb.0:
; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0
; ALL-NEXT: retq
%b = fptosi <8 x double> %a to <8 x i32>
ret <8 x i32> %b
}
; fptosi <4 x double> -> <4 x i32>: the AVX 256-bit vcvttpd2dq suffices on all
; configurations (no AVX-512-specific instruction needed).
define <4 x i32> @f64to4si(<4 x double> %a) {
; ALL-LABEL: f64to4si:
; ALL: # %bb.0:
; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
; fptrunc <16 x double> -> <16 x float>: two vcvtpd2ps halves recombined with
; vinsertf64x4.
define <16 x float> @f64to16f32(<16 x double> %b) nounwind {
; ALL-LABEL: f64to16f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0
; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1
; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%a = fptrunc <16 x double> %b to <16 x float>
ret <16 x float> %a
}
; fptrunc <4 x double> -> <4 x float>: plain 256-bit vcvtpd2ps.
define <4 x float> @f64to4f32(<4 x double> %b) {
; ALL-LABEL: f64to4f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
ret <4 x float> %a
}
; Zero-masked fptrunc <4 x double> -> <4 x float>. VL folds the mask into the
; {z} form of vcvtpd2ps; non-VL materializes the <4 x i1> mask (vpslld $31 +
; vptestmd) and zero-selects with a masked vmovaps on zmm.
; NOTE(review): git-blame commit-message lines that the web scrape interleaved
; into this function (D41560 annotation) have been removed; they are not LLVM IR
; and broke the file.
define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
; NOVL-LABEL: f64to4f32_mask:
; NOVL: # %bb.0:
; NOVL-NEXT: vpslld $31, %xmm1, %xmm1
; NOVL-NEXT: vptestmd %zmm1, %zmm1, %k1
; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
; NOVL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: f64to4f32_mask:
; VL: # %bb.0:
; VL-NEXT: vpslld $31, %xmm1, %xmm1
; VL-NEXT: vptestmd %xmm1, %xmm1, %k1
; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; VL-NEXT: vzeroupper
; VL-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
}
; Scalar-in-vector fptrunc: extract lane 0, truncate, reinsert — matches the
; vcvtsd2ss register form directly.
define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
; ALL-LABEL: f64tof32_inreg:
; ALL: # %bb.0:
; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0
; ALL-NEXT: retq
%ext = extractelement <2 x double> %a0, i32 0
%cvt = fptrunc double %ext to float
%res = insertelement <4 x float> %a1, float %cvt, i32 0
ret <4 x float> %res
}
; fpext <8 x float> -> <8 x double>: single widening vcvtps2pd (ymm -> zmm).
define <8 x double> @f32to8f64(<8 x float> %b) nounwind {
; ALL-LABEL: f32to8f64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
; ALL-NEXT: retq
%a = fpext <8 x float> %b to <8 x double>
ret <8 x double> %a
}
; Zero-masked fpext <4 x float> -> <4 x double> with the mask produced by an
; fcmp ogt (emitted as vcmpltpd with operands swapped). VL folds the mask into
; vcvtps2pd {z}; non-VL widens the compare and select to zmm.
; NOTE(review): git-blame commit-message lines interleaved by the web scrape
; (D41560 annotation) have been removed; they are not LLVM IR and broke the file.
define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) {
; NOVL-LABEL: f32to4f64_mask:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0
; NOVL-NEXT: vcmpltpd %zmm2, %zmm1, %k1
; NOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; NOVL-NEXT: retq
;
; VL-LABEL: f32to4f64_mask:
; VL: # %bb.0:
; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1
; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; VL-NEXT: retq
%a = fpext <4 x float> %b to <4 x double>
%mask = fcmp ogt <4 x double> %a1, %b1
%c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
ret <4 x double> %c
}
; Scalar-in-vector fpext: extract lane 0, extend, reinsert — matches vcvtss2sd.
define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
; ALL-LABEL: f32tof64_inreg:
; ALL: # %bb.0:
; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
%ext = extractelement <4 x float> %a1, i32 0
%cvt = fpext float %ext to double
%res = insertelement <2 x double> %a0, double %cvt, i32 0
ret <2 x double> %res
}
; Scalar sitofp i64 -> double with the load folded into vcvtsi2sdq's memory operand.
define double @sltof64_load(i64* nocapture %e) {
; ALL-LABEL: sltof64_load:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
%tmp1 = load i64, i64* %e, align 8
%conv = sitofp i64 %tmp1 to double
ret double %conv
}
; Scalar sitofp i32 -> double with the load folded (vcvtsi2sdl memory form).
define double @sitof64_load(i32* %e) {
; ALL-LABEL: sitof64_load:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
%tmp1 = load i32, i32* %e, align 4
%conv = sitofp i32 %tmp1 to double
ret double %conv
}
; Scalar sitofp i32 -> float with the load folded (vcvtsi2ssl memory form).
define float @sitof32_load(i32* %e) {
; ALL-LABEL: sitof32_load:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
%tmp1 = load i32, i32* %e, align 4
%conv = sitofp i32 %tmp1 to float
ret float %conv
}
; Scalar sitofp i64 -> float with the load folded (vcvtsi2ssq memory form).
define float @sltof32_load(i64* %e) {
; ALL-LABEL: sltof32_load:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
%tmp1 = load i64, i64* %e, align 8
%conv = sitofp i64 %tmp1 to float
ret float %conv
}
; Stack load + fpext + store round trip: checks vmovss / vcvtss2sd / vmovsd
; against red-zone slots (the allocas stay on the stack, unoptimized).
define void @f32tof64_loadstore() {
; ALL-LABEL: f32tof64_loadstore:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT: retq
entry:
%f = alloca float, align 4
%d = alloca double, align 8
%tmp = load float, float* %f, align 4
%conv = fpext float %tmp to double
store double %conv, double* %d, align 8
ret void
}
; Stack load + fptrunc + store round trip: vmovsd / vcvtsd2ss / vmovss.
define void @f64tof32_loadstore() nounwind uwtable {
; ALL-LABEL: f64tof32_loadstore:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT: retq
entry:
%f = alloca float, align 4
%d = alloca double, align 8
%tmp = load double, double* %d, align 8
%conv = fptrunc double %tmp to float
store float %conv, float* %f, align 4
ret void
}
; bitcast i64 -> double is a plain GPR-to-XMM move (vmovq), no conversion.
define double @long_to_double(i64 %x) {
; ALL-LABEL: long_to_double:
; ALL: # %bb.0:
; ALL-NEXT: vmovq %rdi, %xmm0
; ALL-NEXT: retq
%res = bitcast i64 %x to double
ret double %res
}
; bitcast double -> i64 is an XMM-to-GPR move (vmovq), no conversion.
define i64 @double_to_long(double %x) {
; ALL-LABEL: double_to_long:
; ALL: # %bb.0:
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: retq
%res = bitcast double %x to i64
ret i64 %res
}
; bitcast i32 -> float is a 32-bit GPR-to-XMM move (vmovd).
define float @int_to_float(i32 %x) {
; ALL-LABEL: int_to_float:
; ALL: # %bb.0:
; ALL-NEXT: vmovd %edi, %xmm0
; ALL-NEXT: retq
%res = bitcast i32 %x to float
ret float %res
}
; Bitcast float -> i32 lowers to a single XMM-to-GPR vmovd (no conversion).
define i32 @float_to_int(float %x) {
; ALL-LABEL: float_to_int:
; ALL: # %bb.0:
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: retq
%res = bitcast float %x to i32
ret i32 %res
}
; uitofp <16 x i32> -> <16 x double>: split into two vcvtudq2pd, one per
; 256-bit half of the source, producing two zmm results.
define <16 x double> @uito16f64(<16 x i32> %a) nounwind {
; ALL-LABEL: uito16f64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1
; ALL-NEXT: vmovaps %zmm2, %zmm0
; ALL-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x double>
ret <16 x double> %b
}
; sitofp <8 x i64> -> <8 x float>. Without AVX512DQ the conversion is fully
; scalarized (vpextrq/vmovq each element, vcvtsi2ssq, re-pack with vinsertps/
; vinsertf128); with DQ it is a single vcvtqq2ps.
define <8 x float> @slto8f32(<8 x i64> %a) {
; NODQ-LABEL: slto8f32:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NODQ-NEXT: retq
;
; DQ-LABEL: slto8f32:
; DQ: # %bb.0:
; DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; DQ-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x float>
ret <8 x float> %b
}
; sitofp <16 x i64> -> <16 x float> (two zmm inputs). NODQ scalarizes all 16
; elements; DQ uses two vcvtqq2ps plus a vinsertf64x4 to join the halves.
define <16 x float> @slto16f32(<16 x i64> %a) {
; NODQ-LABEL: slto16f32:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: slto16f32:
; DQ: # %bb.0:
; DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; DQ-NEXT: vcvtqq2ps %zmm1, %ymm1
; DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; DQ-NEXT: retq
%b = sitofp <16 x i64> %a to <16 x float>
ret <16 x float> %b
}
; sitofp <8 x i64> -> <8 x double>. NODQ scalarizes via vcvtsi2sdq and
; re-packs with vmovlhps/vinsertf128/vinsertf64x4; DQ is one vcvtqq2pd.
define <8 x double> @slto8f64(<8 x i64> %a) {
; NODQ-LABEL: slto8f64:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: slto8f64:
; DQ: # %bb.0:
; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; DQ-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
; sitofp <16 x i64> -> <16 x double> (two zmm in, two zmm out). NODQ
; scalarizes all 16 elements; DQ is just two vcvtqq2pd, one per input zmm.
define <16 x double> @slto16f64(<16 x i64> %a) {
; NODQ-LABEL: slto16f64:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0
; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2
; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1
; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
; NODQ-NEXT: retq
;
; DQ-LABEL: slto16f64:
; DQ: # %bb.0:
; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; DQ-NEXT: vcvtqq2pd %zmm1, %zmm1
; DQ-NEXT: retq
%b = sitofp <16 x i64> %a to <16 x double>
ret <16 x double> %b
}
; uitofp <8 x i64> -> <8 x float>. Unsigned counterpart of slto8f32: NODQ
; scalarizes with vcvtusi2ssq; DQ is a single vcvtuqq2ps.
define <8 x float> @ulto8f32(<8 x i64> %a) {
; NODQ-LABEL: ulto8f32:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NODQ-NEXT: retq
;
; DQ-LABEL: ulto8f32:
; DQ: # %bb.0:
; DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; DQ-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x float>
ret <8 x float> %b
}
; uitofp <16 x i64> -> <16 x float> (two zmm inputs). NODQ scalarizes all 16
; elements with vcvtusi2ssq; DQ uses two vcvtuqq2ps plus vinsertf64x4.
define <16 x float> @ulto16f32(<16 x i64> %a) {
; NODQ-LABEL: ulto16f32:
; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1
; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; NODQ-NEXT: vmovq %xmm3, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
; NODQ-NEXT: vpextrq $1, %xmm3, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0
; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: ulto16f32:
; DQ: # %bb.0:
; DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; DQ-NEXT: vcvtuqq2ps %zmm1, %ymm1
; DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; DQ-NEXT: retq
%b = uitofp <16 x i64> %a to <16 x float>
ret <16 x float> %b
}
; Merge-masked uitofp <8 x i32> -> <8 x double>: the select on an i8 mask
; folds into vcvtudq2pd {%k1}. BW targets move the mask with kmovd, non-BW
; targets with kmovw.
define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
; KNL-LABEL: uito8f64_mask:
; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; VLBW-LABEL: uito8f64_mask:
; VLBW: # %bb.0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; VLBW-NEXT: retq
;
; VLNOBW-LABEL: uito8f64_mask:
; VLNOBW: # %bb.0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; VLNOBW-NEXT: retq
;
; AVX512DQ-LABEL: uito8f64_mask:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: uito8f64_mask:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%1 = bitcast i8 %c to <8 x i1>
%2 = uitofp <8 x i32> %b to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
ret <8 x double> %3
}
; Zero-masked variant of uito8f64_mask: select against zeroinitializer folds
; into vcvtudq2pd {%k1} {z}.
define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
; KNL-LABEL: uito8f64_maskz:
; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; VLBW-LABEL: uito8f64_maskz:
; VLBW: # %bb.0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; VLBW-NEXT: retq
;
; VLNOBW-LABEL: uito8f64_maskz:
; VLNOBW: # %bb.0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; VLNOBW-NEXT: retq
;
; AVX512DQ-LABEL: uito8f64_maskz:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: uito8f64_maskz:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
%1 = bitcast i8 %b to <8 x i1>
%2 = uitofp <8 x i32> %a to <8 x double>
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
ret <8 x double> %3
}
; uitofp <4 x i32> -> <4 x double>. With VL the native 256-bit vcvtudq2pd is
; used; without VL the operand is widened to zmm (kill markers) and the
; 512-bit form is used instead.
define <4 x double> @uito4f64(<4 x i32> %a) nounwind {
; NOVL-LABEL: uito4f64:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; NOVL-NEXT: retq
;
; VL-LABEL: uito4f64:
; VL: # %bb.0:
; VL-NEXT: vcvtudq2pd %xmm0, %ymm0
; VL-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x double>
ret <4 x double> %b
}
; uitofp <16 x i32> -> <16 x float>: a single 512-bit vcvtudq2ps everywhere.
define <16 x float> @uito16f32(<16 x i32> %a) nounwind {
; ALL-LABEL: uito16f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x float>
ret <16 x float> %b
}
; uitofp <8 x i32> -> <8 x double>: a single ymm->zmm vcvtudq2pd everywhere.
define <8 x double> @uito8f64(<8 x i32> %a) {
; ALL-LABEL: uito8f64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtudq2pd %ymm0, %zmm0
; ALL-NEXT: retq
%b = uitofp <8 x i32> %a to <8 x double>
ret <8 x double> %b
}
; uitofp <8 x i32> -> <8 x float>. VL uses the 256-bit vcvtudq2ps directly;
; without VL the value is widened to zmm for the 512-bit form.
define <8 x float> @uito8f32(<8 x i32> %a) nounwind {
; NOVL-LABEL: uito8f32:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; NOVL-NEXT: retq
;
; VL-LABEL: uito8f32:
; VL: # %bb.0:
; VL-NEXT: vcvtudq2ps %ymm0, %ymm0
; VL-NEXT: retq
%b = uitofp <8 x i32> %a to <8 x float>
ret <8 x float> %b
}
; uitofp <4 x i32> -> <4 x float>. VL uses the 128-bit vcvtudq2ps; without VL
; the value is widened to zmm, converted, and vzeroupper is emitted before
; returning in xmm.
define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
; NOVL-LABEL: uito4f32:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: uito4f32:
; VL: # %bb.0:
; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; VL-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
ret <4 x float> %b
}
; Scalar fptosi float -> i32: truncating convert vcvttss2si.
define i32 @fptosi(float %a) nounwind {
; ALL-LABEL: fptosi:
; ALL: # %bb.0:
; ALL-NEXT: vcvttss2si %xmm0, %eax
; ALL-NEXT: retq
%b = fptosi float %a to i32
ret i32 %b
}
; Scalar fptoui float -> i32: AVX-512 unsigned truncating convert vcvttss2usi.
define i32 @fptoui(float %a) nounwind {
; ALL-LABEL: fptoui:
; ALL: # %bb.0:
; ALL-NEXT: vcvttss2usi %xmm0, %eax
; ALL-NEXT: retq
%b = fptoui float %a to i32
ret i32 %b
}
; Scalar uitofp i32 -> float: AVX-512 unsigned convert vcvtusi2ssl.
define float @uitof32(i32 %a) nounwind {
; ALL-LABEL: uitof32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
; ALL-NEXT: retq
%b = uitofp i32 %a to float
ret float %b
}
; Scalar uitofp i32 -> double: AVX-512 unsigned convert vcvtusi2sdl.
define double @uitof64(i32 %a) nounwind {
; ALL-LABEL: uitof64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
; ALL-NEXT: retq
%b = uitofp i32 %a to double
ret double %b
}
; sitofp of a <16 x i1> compare mask to <16 x float> (i1 true -> -1.0).
; NODQ materializes all-ones under the mask with vpternlogd; DQ round-trips
; the sign bits through a k-register (vpmovd2m/vpmovm2d). Both finish with
; vcvtdq2ps.
define <16 x float> @sbto16f32(<16 x i32> %a) {
; NODQ-LABEL: sbto16f32:
; NODQ: # %bb.0:
; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: sbto16f32:
; DQ: # %bb.0:
; DQ-NEXT: vpmovd2m %zmm0, %k0
; DQ-NEXT: vpmovm2d %k0, %zmm0
; DQ-NEXT: vcvtdq2ps %zmm0, %zmm0
; DQ-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
%1 = sitofp <16 x i1> %mask to <16 x float>
ret <16 x float> %1
}
; sitofp <16 x i8> -> <16 x float>: sign-extend to i32 with vpmovsxbd, then
; vcvtdq2ps.
define <16 x float> @scto16f32(<16 x i8> %a) {
; ALL-LABEL: scto16f32:
; ALL: # %bb.0:
; ALL-NEXT: vpmovsxbd %xmm0, %zmm0
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%1 = sitofp <16 x i8> %a to <16 x float>
ret <16 x float> %1
}
; sitofp <16 x i16> -> <16 x float>: sign-extend to i32 with vpmovsxwd, then
; vcvtdq2ps.
define <16 x float> @ssto16f32(<16 x i16> %a) {
; ALL-LABEL: ssto16f32:
; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd %ymm0, %zmm0
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%1 = sitofp <16 x i16> %a to <16 x float>
ret <16 x float> %1
}
; sitofp <8 x i16> -> <8 x double>: sign-extend to i32 (vpmovsxwd), then
; vcvtdq2pd. (Name says 16f64 but the test covers the 8-element case.)
define <8 x double> @ssto16f64(<8 x i16> %a) {
; ALL-LABEL: ssto16f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd %xmm0, %ymm0
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
%1 = sitofp <8 x i16> %a to <8 x double>
ret <8 x double> %1
}
; sitofp <8 x i8> -> <8 x double>. The i8 lanes arrive widened in i16 slots,
; so sign extension is done with zero-extend + shl/sra by 24 before
; vcvtdq2pd.
define <8 x double> @scto8f64(<8 x i8> %a) {
; ALL-LABEL: scto8f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; ALL-NEXT: vpslld $24, %ymm0, %ymm0
; ALL-NEXT: vpsrad $24, %ymm0, %ymm0
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
%1 = sitofp <8 x i8> %a to <8 x double>
ret <8 x double> %1
}
; sitofp <16 x i8> -> <16 x double>: one vpmovsxbd to <16 x i32>, then two
; vcvtdq2pd conversions, one per 256-bit half.
define <16 x double> @scto16f64(<16 x i8> %a) {
; ALL-LABEL: scto16f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovsxbd %xmm0, %zmm1
; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
; ALL-NEXT: retq
%b = sitofp <16 x i8> %a to <16 x double>
ret <16 x double> %b
}
; sitofp of a <16 x i1> fcmp-ogt mask to <16 x double>. The compares set k
; registers; mask materialization differs per target (vpternlogd all-ones
; vs. vpmovm2d, and ymm vs. zmm destinations with/without VL), then two
; vcvtdq2pd produce the two result zmms.
define <16 x double> @sbto16f64(<16 x double> %a) {
; NOVLDQ-LABEL: sbto16f64:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; NOVLDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sbto16f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
; VLDQ-NEXT: vpmovm2d %k1, %ymm0
; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLDQ-NEXT: vpmovm2d %k0, %ymm1
; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sbto16f64:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; VLNODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k2} {z}
; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z}
; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sbto16f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <16 x double> %a, zeroinitializer
%1 = sitofp <16 x i1> %cmpres to <16 x double>
ret <16 x double> %1
}
; sitofp of an <8 x i1> fcmp-ogt mask to <8 x double>: single-zmm version of
; sbto16f64 with the same per-target mask-materialization strategies.
define <8 x double> @sbto8f64(<8 x double> %a) {
; NOVLDQ-LABEL: sbto8f64:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sbto8f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm0
; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sbto8f64:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sbto8f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <8 x double> %a, zeroinitializer
%1 = sitofp <8 x i1> %cmpres to <8 x double>
ret <8 x double> %1
}
; sitofp of an <8 x i1> fcmp-ogt mask to <8 x float>. VL targets compare at
; 256 bits; non-VL targets widen to zmm (kill marker) for the compare.
define <8 x float> @sbto8f32(<8 x float> %a) {
; NOVLDQ-LABEL: sbto8f32:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sbto8f32:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm0
; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sbto8f32:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %k1
; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sbto8f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <8 x float> %a, zeroinitializer
%1 = sitofp <8 x i1> %cmpres to <8 x float>
ret <8 x float> %1
}
; sitofp of a <4 x i1> fcmp-ogt mask to <4 x float>: 128-bit variant of
; sbto8f32; non-VL targets widen the compare to zmm and need vzeroupper.
; NOTE: stray git-blame residue lines (commit-message fragments and
; timestamps) that had been pasted into this function were removed; they
; were not valid IR or CHECK directives and broke the test file.
define <4 x float> @sbto4f32(<4 x float> %a) {
; NOVLDQ-LABEL: sbto4f32:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; NOVLDQ-NEXT: vzeroupper
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sbto4f32:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sbto4f32:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sbto4f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <4 x float> %a, zeroinitializer
%1 = sitofp <4 x i1> %cmpres to <4 x float>
ret <4 x float> %1
}
define <4 x double> @sbto4f64(<4 x double> %a) {
; NOVLDQ-LABEL: sbto4f64:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-07 19:20:37 +01:00
; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sbto4f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sbto4f64:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; VLNODQ-NEXT: retq
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-07 19:20:37 +01:00
;
; AVX512DQ-LABEL: sbto4f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
[X86] Make v2i1 and v4i1 legal types without VLX Summary: There are few oddities that occur due to v1i1, v8i1, v16i1 being legal without v2i1 and v4i1 being legal when we don't have VLX. Particularly during legalization of v2i32/v4i32/v2i64/v4i64 masked gather/scatter/load/store. We end up promoting the mask argument to these during type legalization and then have to widen the promoted type to v8iX/v16iX and truncate it to get the element size back down to v8i1/v16i1 to use a 512-bit operation. Since need to fill the upper bits of the mask we have to fill with 0s at the promoted type. It would be better if we could just have the v2i1/v4i1 types as legal so they don't undergo any promotion. Then we can just widen with 0s directly in a k register. There are no real v4i1/v2i1 instructions anyway. Everything is done on a larger register anyway. This also fixes an issue that we couldn't implement a masked vextractf32x4 from zmm to xmm properly. We now have to support widening more compares to 512-bit to get a mask result out so new tablegen patterns got added. I had to hack the legalizer for widening the operand of a setcc a bit so it didn't try create a setcc returning v4i32, extract from it, then try to promote it using a sign extend to v2i1. Now we create the setcc with v4i1 if the original setcc's result type is v2i1. Then extract that and don't sign extend it at all. There's definitely room for improvement with some follow up patches. Reviewers: RKSimon, zvi, guyblank Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41560 llvm-svn: 321967
2018-01-07 19:20:37 +01:00
; AVX512DQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512DQ-NEXT: retq
%cmpres = fcmp ogt <4 x double> %a, zeroinitializer
%1 = sitofp <4 x i1> %cmpres to <4 x double>
ret <4 x double> %1
}
; sitofp <2 x i1> -> <2 x float>: v2i1 compare mask is sign-extended to
; -1/0 i32 lanes and converted with vcvtdq2ps. (Injected VCS-history
; residue lines that were not valid IR or comments have been removed.)
define <2 x float> @sbto2f32(<2 x float> %a) {
; NOVLDQ-LABEL: sbto2f32:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; NOVLDQ-NEXT: vzeroupper
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sbto2f32:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sbto2f32:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sbto2f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %cmpres = fcmp ogt <2 x float> %a, zeroinitializer
  %1 = sitofp <2 x i1> %cmpres to <2 x float>
  ret <2 x float> %1
}
; sitofp <2 x i1> -> <2 x double>: same mask-materialization pattern as
; sbto2f32 but with packed-double compare/convert. (Injected VCS-history
; residue lines that were not valid IR or comments have been removed.)
define <2 x double> @sbto2f64(<2 x double> %a) {
; NOVLDQ-LABEL: sbto2f64:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
; NOVLDQ-NEXT: vzeroupper
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: sbto2f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: sbto2f64:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: sbto2f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %cmpres = fcmp ogt <2 x double> %a, zeroinitializer
  %1 = sitofp <2 x i1> %cmpres to <2 x double>
  ret <2 x double> %1
}
; uitofp <16 x i8> -> <16 x float>: bytes are zero-extended to i32 lanes,
; then the *signed* vcvtdq2ps is safe because 0..255 fits in a non-negative i32.
define <16 x float> @ucto16f32(<16 x i8> %a) {
; ALL-LABEL: ucto16f32:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
  %b = uitofp <16 x i8> %a to <16 x float>
  ret <16 x float>%b
}
; uitofp <8 x i8> -> <8 x double>: v8i8 arrives widened in xmm0, so vpand
; first masks off the unrelated high byte of each 16-bit slot before the
; zero-extend to i32 and the signed dword->double convert.
define <8 x double> @ucto8f64(<8 x i8> %a) {
; ALL-LABEL: ucto8f64:
; ALL: # %bb.0:
; ALL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
  %b = uitofp <8 x i8> %a to <8 x double>
  ret <8 x double> %b
}
; sitofp <16 x i16> -> <16 x float>: sign-extend words to dwords, then one
; 512-bit signed convert.
define <16 x float> @swto16f32(<16 x i16> %a) {
; ALL-LABEL: swto16f32:
; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd %ymm0, %zmm0
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
  %b = sitofp <16 x i16> %a to <16 x float>
  ret <16 x float> %b
}
; sitofp <8 x i16> -> <8 x double>: sign-extend words to dwords, then one
; ymm->zmm dword-to-double convert.
define <8 x double> @swto8f64(<8 x i16> %a) {
; ALL-LABEL: swto8f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd %xmm0, %ymm0
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
  %b = sitofp <8 x i16> %a to <8 x double>
  ret <8 x double> %b
}
; sitofp <16 x i16> -> <16 x double>: sign-extend all 16 words into one zmm,
; then convert the low and high 8-dword halves separately into the two
; result registers (zmm0/zmm1 is the 16 x double return pair).
define <16 x double> @swto16f64(<16 x i16> %a) {
; ALL-LABEL: swto16f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd %ymm0, %zmm1
; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
; ALL-NEXT: retq
  %b = sitofp <16 x i16> %a to <16 x double>
  ret <16 x double> %b
}
; uitofp <16 x i8> -> <16 x double>: zero-extend bytes to dwords, then
; convert in two 8-element halves (low half straight from ymm1, high half
; after a 256-bit extract).
define <16 x double> @ucto16f64(<16 x i8> %a) {
; ALL-LABEL: ucto16f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
; ALL-NEXT: retq
  %b = uitofp <16 x i8> %a to <16 x double>
  ret <16 x double> %b
}
; uitofp <16 x i16> -> <16 x float>: zero-extend words to dwords; signed
; vcvtdq2ps is correct since 0..65535 is non-negative in i32.
define <16 x float> @uwto16f32(<16 x i16> %a) {
; ALL-LABEL: uwto16f32:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
  %b = uitofp <16 x i16> %a to <16 x float>
  ret <16 x float> %b
}
; uitofp <8 x i16> -> <8 x double>: zero-extend words to dwords, then one
; signed dword-to-double convert.
define <8 x double> @uwto8f64(<8 x i16> %a) {
; ALL-LABEL: uwto8f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
  %b = uitofp <8 x i16> %a to <8 x double>
  ret <8 x double> %b
}
; uitofp <16 x i16> -> <16 x double>: zero-extend all 16 words into zmm1,
; then convert the low and high 8-dword halves into the zmm0/zmm1 result pair.
define <16 x double> @uwto16f64(<16 x i16> %a) {
; ALL-LABEL: uwto16f64:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
; ALL-NEXT: retq
  %b = uitofp <16 x i16> %a to <16 x double>
  ret <16 x double> %b
}
; sitofp <16 x i32> -> <16 x float>: maps directly to one vcvtdq2ps.
define <16 x float> @sito16f32(<16 x i32> %a) {
; ALL-LABEL: sito16f32:
; ALL: # %bb.0:
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
  %b = sitofp <16 x i32> %a to <16 x float>
  ret <16 x float> %b
}
; sitofp <16 x i32> -> <16 x double>: convert low half into a scratch zmm2
; first (the source still lives in zmm0), then the extracted high half into
; zmm1, and finally move the low result into zmm0 for the return pair.
define <16 x double> @sito16f64(<16 x i32> %a) {
; ALL-LABEL: sito16f64:
; ALL: # %bb.0:
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm2
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm1
; ALL-NEXT: vmovaps %zmm2, %zmm0
; ALL-NEXT: retq
  %b = sitofp <16 x i32> %a to <16 x double>
  ret <16 x double> %b
}
; uitofp <16 x i16> -> <16 x float>: same lowering as uwto16f32 above
; (zero-extend then signed convert).
define <16 x float> @usto16f32(<16 x i16> %a) {
; ALL-LABEL: usto16f32:
; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
  %b = uitofp <16 x i16> %a to <16 x float>
  ret <16 x float> %b
}
; uitofp <16 x i1> -> <16 x float>: each mask bit must become 0.0 or 1.0.
; The mask (sign bit of each i32) is materialized as all-ones/all-zeros
; lanes, logical-shifted right by 31 to get 0/1, then converted.
define <16 x float> @ubto16f32(<16 x i32> %a) {
; NODQ-LABEL: ubto16f32:
; NODQ: # %bb.0:
; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NODQ-NEXT: vpsrld $31, %zmm0, %zmm0
; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: ubto16f32:
; DQ: # %bb.0:
; DQ-NEXT: vpmovd2m %zmm0, %k0
; DQ-NEXT: vpmovm2d %k0, %zmm0
; DQ-NEXT: vpsrld $31, %zmm0, %zmm0
; DQ-NEXT: vcvtdq2ps %zmm0, %zmm0
; DQ-NEXT: retq
  %mask = icmp slt <16 x i32> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  ret <16 x float> %1
}
; uitofp <16 x i1> -> <16 x double>: same 0/1 materialization as ubto16f32,
; but the 16-lane mask is consumed in two 8-lane halves (kshiftrw $8 moves
; the upper half of the k-register down) to fill the zmm0/zmm1 result pair.
define <16 x double> @ubto16f64(<16 x i32> %a) {
; NOVLDQ-LABEL: ubto16f64:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; NOVLDQ-NEXT: kshiftrw $8, %k1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NOVLDQ-NEXT: vpsrld $31, %ymm1, %ymm1
; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto16f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vpmovd2m %zmm0, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm0
; VLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLDQ-NEXT: kshiftrw $8, %k0, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm1
; VLDQ-NEXT: vpsrld $31, %ymm1, %ymm1
; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: ubto16f64:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z}
; VLNODQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLNODQ-NEXT: kshiftrw $8, %k1, %k1
; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z}
; VLNODQ-NEXT: vpsrld $31, %ymm1, %ymm1
; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto16f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpsrld $31, %ymm1, %ymm1
; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; AVX512DQ-NEXT: retq
  %mask = icmp slt <16 x i32> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  ret <16 x double> %1
}
; uitofp <8 x i1> -> <8 x float>: 8-lane variant of the mask -> 0/1 -> float
; pattern (all-ones lanes shifted right by 31, then converted).
define <8 x float> @ubto8f32(<8 x i32> %a) {
; NOVLDQ-LABEL: ubto8f32:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto8f32:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vpmovd2m %ymm0, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm0
; VLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: ubto8f32:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; VLNODQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto8f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512DQ-NEXT: retq
  %mask = icmp slt <8 x i32> %a, zeroinitializer
  %1 = uitofp <8 x i1> %mask to <8 x float>
  ret <8 x float> %1
}
; uitofp <8 x i1> -> <8 x double>: same pattern as ubto8f32 but the final
; convert widens the 8 x i32 0/1 lanes to 8 x double in a zmm.
define <8 x double> @ubto8f64(<8 x i32> %a) {
; NOVLDQ-LABEL: ubto8f64:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto8f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vpmovd2m %ymm0, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm0
; VLDQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: ubto8f64:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; VLNODQ-NEXT: vpsrld $31, %ymm0, %ymm0
; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto8f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: retq
  %mask = icmp slt <8 x i32> %a, zeroinitializer
  %1 = uitofp <8 x i1> %mask to <8 x double>
  ret <8 x double> %1
}
; uitofp <4 x i1> -> <4 x float>: 4-lane variant; non-VLX targets widen to
; zmm for the compare (hence the kill comment and trailing vzeroupper).
define <4 x float> @ubto4f32(<4 x i32> %a) {
; NOVLDQ-LABEL: ubto4f32:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; NOVLDQ-NEXT: vzeroupper
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto4f32:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vpmovd2m %xmm0, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: ubto4f32:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto4f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %mask = icmp slt <4 x i32> %a, zeroinitializer
  %1 = uitofp <4 x i1> %mask to <4 x float>
  ret <4 x float> %1
}
; uitofp <4 x i1> -> <4 x double> via 0/1 i32 lanes and vcvtdq2pd.
; Autogenerated CHECK lines; do not edit by hand.
define <4 x double> @ubto4f64(<4 x i32> %a) {
; NOVLDQ-LABEL: ubto4f64:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; NOVLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto4f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vpmovd2m %xmm0, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: ubto4f64:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto4f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512DQ-NEXT: retq
%mask = icmp slt <4 x i32> %a, zeroinitializer
%1 = uitofp <4 x i1> %mask to <4 x double>
ret <4 x double> %1
}
; uitofp <2 x i1> (icmp ne) -> <2 x float>. The v2i32 input lives in the
; even lanes of an xmm; vpblendd zeroes the odd lanes before vptestmq.
; Autogenerated CHECK lines; do not edit by hand.
define <2 x float> @ubto2f32(<2 x i32> %a) {
; NOVLDQ-LABEL: ubto2f32:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOVLDQ-NEXT: vptestmq %zmm0, %zmm0, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; NOVLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; NOVLDQ-NEXT: vzeroupper
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto2f32:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VLDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: ubto2f32:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VLNODQ-NEXT: vptestmq %xmm0, %xmm0, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto2f32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%mask = icmp ne <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x float>
ret <2 x float> %1
}
; uitofp <2 x i1> (icmp ne) -> <2 x double>; targets without VL widen the
; final convert to vcvtudq2pd on zmm. Autogenerated CHECK lines; do not
; edit by hand.
define <2 x double> @ubto2f64(<2 x i32> %a) {
; NOVLDQ-LABEL: ubto2f64:
; NOVLDQ: # %bb.0:
; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOVLDQ-NEXT: vptestmq %zmm0, %zmm0, %k1
; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; NOVLDQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; NOVLDQ-NEXT: vzeroupper
; NOVLDQ-NEXT: retq
;
; VLDQ-LABEL: ubto2f64:
; VLDQ: # %bb.0:
; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VLDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; VLNODQ-LABEL: ubto2f64:
; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VLNODQ-NEXT: vptestmq %xmm0, %xmm0, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; VLNODQ-NEXT: vpsrld $31, %xmm0, %xmm0
; VLNODQ-NEXT: vcvtudq2pd %xmm0, %xmm0
; VLNODQ-NEXT: retq
;
; AVX512DQ-LABEL: ubto2f64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%mask = icmp ne <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x double>
ret <2 x double> %1
}
; fptoui <2 x double> -> <2 x i1> used as a select mask: truncating convert,
; vpslld $31 to move bit 0 into the sign position, vptestmd into a k-reg.
; Autogenerated CHECK lines; do not edit by hand.
define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) {
; NOVL-LABEL: test_2f64toub:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: test_2f64toub:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2udq %xmm0, %xmm0
; VL-NEXT: vpslld $31, %xmm0, %xmm0
; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptoui <2 x double> %a to <2 x i1>
%select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
ret <2 x i64> %select
}
; fptoui <4 x double> -> <4 x i1> used as a select mask: truncating convert,
; vpslld $31 into the sign bit, vptestmd into a k-register.
; Autogenerated CHECK lines; do not edit by hand.
define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) {
; NOVL-LABEL: test_4f64toub:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; NOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; NOVL-NEXT: retq
;
; VL-LABEL: test_4f64toub:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %ymm0, %xmm0
; VL-NEXT: vpslld $31, %xmm0, %xmm0
; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptoui <4 x double> %a to <4 x i1>
%select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
ret <4 x i64> %select
}
; fptoui <8 x double> -> <8 x i1> select mask; both prefixes use the native
; zmm convert. Autogenerated CHECK lines; do not edit by hand.
define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) {
; NOVL-LABEL: test_8f64toub:
; NOVL: # %bb.0:
; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: retq
;
; VL-LABEL: test_8f64toub:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
; VL-NEXT: vpslld $31, %ymm0, %ymm0
; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptoui <8 x double> %a to <8 x i1>
%select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
ret <8 x i64> %select
}
; fptoui <2 x float> -> <2 x i1> select mask: vcvttps2dq, vpslld $31,
; vptestmd. Autogenerated CHECK lines; do not edit by hand.
define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) {
; NOVL-LABEL: test_2f32toub:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: test_2f32toub:
; VL: # %bb.0:
; VL-NEXT: vcvttps2dq %xmm0, %xmm0
; VL-NEXT: vpslld $31, %xmm0, %xmm0
; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptoui <2 x float> %a to <2 x i1>
%select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
ret <2 x i64> %select
}
; fptoui <4 x float> -> <4 x i1> select mask: vcvttps2dq, vpslld $31,
; vptestmd. Autogenerated CHECK lines; do not edit by hand.
define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) {
; NOVL-LABEL: test_4f32toub:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; NOVL-NEXT: retq
;
; VL-LABEL: test_4f32toub:
; VL: # %bb.0:
; VL-NEXT: vcvttps2dq %xmm0, %xmm0
; VL-NEXT: vpslld $31, %xmm0, %xmm0
; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptoui <4 x float> %a to <4 x i1>
%select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
ret <4 x i64> %select
}
; fptoui <8 x float> -> <8 x i1> select mask; only the vptestmd register
; width differs between prefixes. Autogenerated CHECK lines; do not edit
; by hand.
define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) {
; NOVL-LABEL: test_8f32toub:
; NOVL: # %bb.0:
; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0
; NOVL-NEXT: vpslld $31, %ymm0, %ymm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: retq
;
; VL-LABEL: test_8f32toub:
; VL: # %bb.0:
; VL-NEXT: vcvttps2dq %ymm0, %ymm0
; VL-NEXT: vpslld $31, %ymm0, %ymm0
; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptoui <8 x float> %a to <8 x i1>
%select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
ret <8 x i64> %select
}
; fptoui <16 x float> -> <16 x i1> select mask; identical zmm lowering on
; all configurations. Autogenerated CHECK lines; do not edit by hand.
define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) {
; ALL-LABEL: test_16f32toub:
; ALL: # %bb.0:
; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
; ALL-NEXT: vpslld $31, %zmm0, %zmm0
; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
; ALL-NEXT: retq
%mask = fptoui <16 x float> %a to <16 x i1>
%select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
ret <16 x i32> %select
}
; fptosi <2 x double> -> <2 x i1> select mask. Unlike the unsigned tests
; above, a vpslld is still emitted here before vptestmd. Autogenerated
; CHECK lines; do not edit by hand.
define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) {
; NOVL-LABEL: test_2f64tosb:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; NOVL-NEXT: vcvttpd2dq %xmm0, %xmm0
; NOVL-NEXT: vpslld $31, %xmm0, %xmm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: test_2f64tosb:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %xmm0, %xmm0
; VL-NEXT: vpslld $31, %xmm0, %xmm0
; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptosi <2 x double> %a to <2 x i1>
%select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
ret <2 x i64> %select
}
; fptosi <4 x double> -> <4 x i1> select mask; no shift is needed before
; vptestmd. Autogenerated CHECK lines; do not edit by hand.
define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) {
; NOVL-LABEL: test_4f64tosb:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; NOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; NOVL-NEXT: retq
;
; VL-LABEL: test_4f64tosb:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %ymm0, %xmm0
; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptosi <4 x double> %a to <4 x i1>
%select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
ret <4 x i64> %select
}
; fptosi <8 x double> -> <8 x i1> select mask via native zmm convert.
; Autogenerated CHECK lines; do not edit by hand.
define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) {
; NOVL-LABEL: test_8f64tosb:
; NOVL: # %bb.0:
; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: retq
;
; VL-LABEL: test_8f64tosb:
; VL: # %bb.0:
; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptosi <8 x double> %a to <8 x i1>
%select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
ret <8 x i64> %select
}
; fptosi <2 x float> -> <2 x i1> select mask; no shift is needed before
; vptestmd. Autogenerated CHECK lines; do not edit by hand.
define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) {
; NOVL-LABEL: test_2f32tosb:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; NOVL-NEXT: vzeroupper
; NOVL-NEXT: retq
;
; VL-LABEL: test_2f32tosb:
; VL: # %bb.0:
; VL-NEXT: vcvttps2dq %xmm0, %xmm0
; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
; VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptosi <2 x float> %a to <2 x i1>
%select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
ret <2 x i64> %select
}
; fptosi <4 x float> -> <4 x i1> select mask; no shift is needed before
; vptestmd. Autogenerated CHECK lines; do not edit by hand.
define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) {
; NOVL-LABEL: test_4f32tosb:
; NOVL: # %bb.0:
; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; NOVL-NEXT: vcvttps2dq %xmm0, %xmm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; NOVL-NEXT: retq
;
; VL-LABEL: test_4f32tosb:
; VL: # %bb.0:
; VL-NEXT: vcvttps2dq %xmm0, %xmm0
; VL-NEXT: vptestmd %xmm0, %xmm0, %k1
; VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptosi <4 x float> %a to <4 x i1>
%select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
ret <4 x i64> %select
}
; fptosi <8 x float> -> <8 x i1> select mask. Autogenerated CHECK lines;
; do not edit by hand.
define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) {
; NOVL-LABEL: test_8f32tosb:
; NOVL: # %bb.0:
; NOVL-NEXT: vcvttps2dq %ymm0, %ymm0
; NOVL-NEXT: vptestmd %zmm0, %zmm0, %k1
; NOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; NOVL-NEXT: retq
;
; VL-LABEL: test_8f32tosb:
; VL: # %bb.0:
; VL-NEXT: vcvttps2dq %ymm0, %ymm0
; VL-NEXT: vptestmd %ymm0, %ymm0, %k1
; VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
; VL-NEXT: retq
%mask = fptosi <8 x float> %a to <8 x i1>
%select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
ret <8 x i64> %select
}
; fptosi <16 x float> -> <16 x i1> select mask; identical zmm lowering on
; all configurations. Autogenerated CHECK lines; do not edit by hand.
define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) {
; ALL-LABEL: test_16f32tosb:
; ALL: # %bb.0:
; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
; ALL-NEXT: vptestmd %zmm0, %zmm0, %k1
; ALL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
; ALL-NEXT: retq
%mask = fptosi <16 x float> %a to <16 x i1>
%select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
ret <16 x i32> %select
}