diff --git a/test/CodeGen/X86/sadd_sat_vec.ll b/test/CodeGen/X86/sadd_sat_vec.ll index 25e9a09c43d..2961129dfb1 100644 --- a/test/CodeGen/X86/sadd_sat_vec.ll +++ b/test/CodeGen/X86/sadd_sat_vec.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW declare <1 x i8> @llvm.sadd.sat.v1i8(<1 x i8>, <1 x i8>) declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) @@ -111,10 +112,19 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX2-NEXT: vpaddsb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddsb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v64i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddsb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpaddsb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y) ret <64 x i8> %z } @@ -191,10 +201,19 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX2-NEXT: vpaddsw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v32i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y) ret <32 x i16> %z } @@ -551,15 +570,28 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX512-NEXT: vpmovb2m %xmm1, %k0 -; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512-NEXT: vpmovb2m %xmm0, %k1 -; AVX512-NEXT: korw %k0, %k1, %k0 -; AVX512-NEXT: vpmovm2b %k0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovb2m %xmm1, %k0 +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %xmm0, %k1 +; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %xmm0 +; AVX512BW-NEXT: retq %z = call <16 x i1> @llvm.sadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z } @@ -639,19 +671,30 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm3 +; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} +; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512BW-NEXT: retq %z = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z } @@ -729,19 +772,30 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX2-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm3 +; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} +; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512BW-NEXT: retq %z = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z } @@ -866,19 +920,30 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX2-NEXT: vblendvps %ymm0, %ymm3, %ymm2, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k0 -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k2} -; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vblendvps %ymm0, %ymm3, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v8i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm2, %k0 +; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm2, %k2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k2} +; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512BW-NEXT: retq %z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z } @@ -1221,19 +1286,29 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX2-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v2i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k0 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vblendvpd %xmm2, {{.*}}(%rip), %xmm3, %xmm3 +; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vblendvpd %xmm0, %xmm3, %xmm2, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; AVX512BW-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512BW-NEXT: retq %z = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z } @@ -1426,19 +1501,30 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm2, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtq %ymm1, %ymm2, %k0 -; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %ymm1, %ymm2, %k2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k2} -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vblendvpd %ymm2, %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vblendvpd %ymm0, %ymm3, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm2, %k0 +; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm2, %k2 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512BW-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512BW-NEXT: retq %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z } diff --git a/test/CodeGen/X86/ssub_sat_vec.ll b/test/CodeGen/X86/ssub_sat_vec.ll index c3612a7a538..784fa0e21ec 100644 --- a/test/CodeGen/X86/ssub_sat_vec.ll +++ b/test/CodeGen/X86/ssub_sat_vec.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW declare <1 x i8> @llvm.ssub.sat.v1i8(<1 x i8>, <1 x i8>) declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) @@ -111,10 +112,19 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX2-NEXT: vpsubsb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubsb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v64i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsubsb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsubsb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y) ret <64 x i8> %z } @@ -191,10 +201,19 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX2-NEXT: vpsubsw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v32i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsubsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y) ret <32 x i16> %z } @@ -547,15 +566,28 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512-NEXT: vpmovb2m %xmm0, %k0 -; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0 -; AVX512-NEXT: vpmovb2m %xmm0, %k1 -; AVX512-NEXT: kandnw %k0, %k1, %k0 -; AVX512-NEXT: vpmovm2b %k0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %xmm0, %k0 +; AVX512BW-NEXT: vpsllw $7, %xmm1, %xmm0 +; AVX512BW-NEXT: vpmovb2m %xmm0, %k1 +; AVX512BW-NEXT: kandnw %k0, %k1, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %xmm0 +; AVX512BW-NEXT: retq %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z } @@ -641,19 +673,32 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 +; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} +; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512BW-NEXT: retq %z = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z } @@ -737,19 +782,32 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtd %xmm2, %xmm1, %k0 +; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm2, %k2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] +; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k2} +; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512BW-NEXT: retq %z = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z } @@ -883,19 +941,32 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtd %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtd %ymm1, %ymm2, %k2 -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k2} -; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647] +; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512F-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v8i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtd %ymm2, %ymm1, %k0 +; AVX512BW-NEXT: vpsubd %ymm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm2, %k2 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX512BW-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k2} +; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512BW-NEXT: retq %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z } @@ -1280,19 +1351,31 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v2i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtq %xmm2, %xmm1, %k0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; AVX512-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2} -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vblendvpd %xmm1, {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtq %xmm2, %xmm1, %k0 +; AVX512BW-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm2, %k2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; AVX512BW-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512BW-NEXT: retq %z = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z } @@ -1532,19 +1615,32 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpcmpgtq %ymm2, %ymm1, %k0 -; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpgtq %ymm1, %ymm2, %k2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k2} -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} -; AVX512-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpsubq %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807] +; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpcmpgtq %ymm2, %ymm1, %k0 +; AVX512BW-NEXT: vpsubq %ymm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1 +; AVX512BW-NEXT: kxorw %k1, %k0, %k1 +; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm2, %k2 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX512BW-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512BW-NEXT: retq %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z } diff --git a/test/CodeGen/X86/uadd_sat_vec.ll b/test/CodeGen/X86/uadd_sat_vec.ll index b398c44b4a0..3a4e5974289 100644 --- a/test/CodeGen/X86/uadd_sat_vec.ll +++ b/test/CodeGen/X86/uadd_sat_vec.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW declare <1 x i8> @llvm.uadd.sat.v1i8(<1 x i8>, <1 x i8>) declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) @@ -111,10 +112,19 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX2-NEXT: vpaddusb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddusb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v64i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddusb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpaddusb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y) ret <64 x i8> %z } @@ -191,10 +201,19 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX2-NEXT: vpaddusw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v32i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpaddusw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y) ret <32 x i16> %z } @@ -524,15 +543,28 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX512-NEXT: vpmovb2m %xmm1, %k0 -; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512-NEXT: vpmovb2m %xmm0, %k1 -; AVX512-NEXT: korw %k0, %k1, %k0 -; AVX512-NEXT: vpmovm2b %k0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovb2m %xmm1, %k0 +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %xmm0, %k1 +; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %xmm0 +; AVX512BW-NEXT: retq %z = call <16 x i1> @llvm.uadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z } @@ -584,13 +616,23 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 -; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v2i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v2i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512BW-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512BW-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: retq %z = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %x, <2 x i32> %y) ret <2 x i32> %z } @@ -640,13 +682,23 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 -; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v4i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512BW-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512BW-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: retq %z = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y) ret <4 x i32> %z } @@ -719,13 +771,22 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm2 -; AVX512-NEXT: vpminud %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpminud %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v8i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512BW-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm2 +; AVX512BW-NEXT: vpminud %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %z = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y) ret <8 x i32> %z } @@ -926,13 +987,24 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v2i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 -; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512BW-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512BW-NEXT: vpminuq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: retq %z = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z } @@ -1063,13 +1135,23 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm2 -; AVX512-NEXT: vpminuq %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm2 +; AVX512F-NEXT: vpminuq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512BW-NEXT: vpternlogq $15, %ymm1, %ymm1, %ymm2 +; AVX512BW-NEXT: vpminuq %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %z = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z } diff --git a/test/CodeGen/X86/usub_sat_vec.ll b/test/CodeGen/X86/usub_sat_vec.ll index d56f9150a84..d455a034f0c 100644 --- a/test/CodeGen/X86/usub_sat_vec.ll +++ b/test/CodeGen/X86/usub_sat_vec.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW declare <1 x i8> @llvm.usub.sat.v1i8(<1 x i8>, <1 x i8>) declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) @@ -111,10 +112,19 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v64i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsubusb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsubusb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y) ret <64 x i8> %z } @@ -191,10 +201,19 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX2-NEXT: vpsubusw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v32i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsubusw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v32i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq %z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y) ret <32 x i16> %z } @@ -524,15 +543,28 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512-NEXT: vpmovb2m %xmm0, %k0 -; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0 -; AVX512-NEXT: vpmovb2m %xmm0, %k1 -; AVX512-NEXT: kandnw %k0, %k1, %k0 -; AVX512-NEXT: vpmovm2b %k0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovb2m %xmm0, %k0 +; AVX512BW-NEXT: vpsllw $7, %xmm1, %xmm0 +; AVX512BW-NEXT: vpmovb2m %xmm0, %k1 +; AVX512BW-NEXT: kandnw %k0, %k1, %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %xmm0 +; AVX512BW-NEXT: retq %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z } @@ -866,11 +898,20 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v2i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v2i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v2i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: retq %z = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %x, <2 x i64> %y) ret <2 x i64> %z } @@ -998,11 +1039,19 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: v4i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v4i64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y) ret <4 x i64> %z }