From bc277bc1b01a90e18b50a66ae7ea047191149d1c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 26 Nov 2017 20:03:53 +0000
Subject: [PATCH] [X86][SSE] Add SSE42 tests to the clear upper tests
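
With SSE4.2 available, the "clear upper" patterns currently lower to a
blend with a zeroed register instead of an AND with a constant-pool
mask. A minimal sketch of the contrast, taken from the
_clearupper2xi64a checks below:

  SSE2:   andps {{.*}}(%rip), %xmm0
  SSE42:  pxor %xmm1, %xmm1
          pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]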

llvm-svn: 319003
---
 .../X86/clear_upper_vector_element_bits.ll | 945 +++++++++++-------
 1 file changed, 593 insertions(+), 352 deletions(-)

diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index c9cd7e13a79..22ec4d392b7 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

@@ -8,10 +9,16 @@
 ;

 define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
-; SSE-LABEL: _clearupper2xi64a:
-; SSE:       # BB#0:
-; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper2xi64a:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper2xi64a:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper2xi64a:
 ; AVX1:       # BB#0:
@@ -36,12 +43,19 @@ define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
 }

 define <4 x i64> @_clearupper4xi64a(<4 x i64>) nounwind {
-; SSE-LABEL: _clearupper4xi64a:
-; SSE:       # BB#0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper4xi64a:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper4xi64a:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper4xi64a:
 ; AVX:       # BB#0:
@@ -68,10 +82,16 @@ define <4 x i64> @_clearupper4xi64a(<4 x i64>) nounwind {
 }

 define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
-; SSE-LABEL: _clearupper4xi32a:
-; SSE:       # BB#0:
-; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper4xi32a:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper4xi32a:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper4xi32a:
 ; AVX:       # BB#0:
@@ -98,12 +118,19 @@ define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
 }

 define <8 x i32> @_clearupper8xi32a(<8 x i32>) nounwind {
-; SSE-LABEL: _clearupper8xi32a:
-; SSE:       # BB#0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper8xi32a:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper8xi32a:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper8xi32a:
 ; AVX1:       # BB#0:
@@ -275,54 +302,59 @@ define <16 x i16> @_clearupper16xi16a(<16 x i16>) nounwind {
 }

 define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
-; SSE-LABEL: _clearupper16xi8a:
-; SSE:       # BB#0:
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm3
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm4
-; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper16xi8a:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper16xi8a:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper16xi8a:
 ; AVX:       # BB#0:
@@ -396,100 +428,107 @@ define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
 }

 define <32 x i8> @_clearupper32xi8a(<32 x i8>) nounwind {
-; SSE-LABEL: _clearupper32xi8a:
-; SSE:       # BB#0:
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm3
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm4
-; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm3
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm5
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm4
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm4
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movd %eax, %xmm6
-; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper32xi8a:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    movd %eax, %xmm6
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper32xi8a:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    movaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE42-NEXT:    andps %xmm2, %xmm0
+; SSE42-NEXT:    andps %xmm2, %xmm1
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper32xi8a:
 ; AVX:       # BB#0:
@@ -627,10 +666,16 @@ define <32 x i8> @_clearupper32xi8a(<32 x i8>) nounwind {
 }

 define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
-; SSE-LABEL: _clearupper2xi64b:
-; SSE:       # BB#0:
-; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper2xi64b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper2xi64b:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper2xi64b:
 ; AVX1:       # BB#0:
@@ -651,12 +696,19 @@ define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
 }

 define <4 x i64> @_clearupper4xi64b(<4 x i64>) nounwind {
-; SSE-LABEL: _clearupper4xi64b:
-; SSE:       # BB#0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper4xi64b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper4xi64b:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper4xi64b:
 ; AVX:       # BB#0:
@@ -673,10 +725,16 @@ define <4 x i64> @_clearupper4xi64b(<4 x i64>) nounwind {
 }

 define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
-; SSE-LABEL: _clearupper4xi32b:
-; SSE:       # BB#0:
-; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper4xi32b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper4xi32b:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper4xi32b:
 ; AVX:       # BB#0:
@@ -693,12 +751,19 @@ define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
 }

 define <8 x i32> @_clearupper8xi32b(<8 x i32>) nounwind {
-; SSE-LABEL: _clearupper8xi32b:
-; SSE:       # BB#0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper8xi32b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper8xi32b:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper8xi32b:
 ; AVX1:       # BB#0:
@@ -784,81 +849,156 @@ define <16 x i16> @_clearupper16xi16b(<16 x i16>) nounwind {
 }

 define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
-; SSE-LABEL: _clearupper16xi8b:
-; SSE:       # BB#0:
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    movq %xmm0, %rcx
-; SSE-NEXT:    movq %rcx, %r8
-; SSE-NEXT:    movq %rcx, %r9
-; SSE-NEXT:    movq %rcx, %r10
-; SSE-NEXT:    movq %rcx, %rax
-; SSE-NEXT:    movq %rcx, %rdx
-; SSE-NEXT:    movq %rcx, %rsi
-; SSE-NEXT:    movq %rcx, %rdi
-; SSE-NEXT:    andb $15, %cl
-; SSE-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %xmm1, %rcx
-; SSE-NEXT:    shrq $56, %rdi
-; SSE-NEXT:    andb $15, %dil
-; SSE-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %r11
-; SSE-NEXT:    shrq $48, %rsi
-; SSE-NEXT:    andb $15, %sil
-; SSE-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %r14
-; SSE-NEXT:    shrq $40, %rdx
-; SSE-NEXT:    andb $15, %dl
-; SSE-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rdx
-; SSE-NEXT:    shrq $32, %rax
-; SSE-NEXT:    andb $15, %al
-; SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rax
-; SSE-NEXT:    shrq $24, %r10
-; SSE-NEXT:    andb $15, %r10b
-; SSE-NEXT:    movb %r10b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rdi
-; SSE-NEXT:    shrq $16, %r9
-; SSE-NEXT:    andb $15, %r9b
-; SSE-NEXT:    movb %r9b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rsi
-; SSE-NEXT:    shrq $8, %r8
-; SSE-NEXT:    andb $15, %r8b
-; SSE-NEXT:    movb %r8b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rbx
-; SSE-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    andb $15, %cl
-; SSE-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $56, %rbx
-; SSE-NEXT:    andb $15, %bl
-; SSE-NEXT:    movb %bl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $48, %rsi
-; SSE-NEXT:    andb $15, %sil
-; SSE-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $40, %rdi
-; SSE-NEXT:    andb $15, %dil
-; SSE-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $32, %rax
-; SSE-NEXT:    andb $15, %al
-; SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $24, %rdx
-; SSE-NEXT:    andb $15, %dl
-; SSE-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $16, %r14
-; SSE-NEXT:    andb $15, %r14b
-; SSE-NEXT:    movb %r14b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $8, %r11
-; SSE-NEXT:    andb $15, %r11b
-; SSE-NEXT:    movb %r11b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper16xi8b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %rcx, %r8
+; SSE2-NEXT:    movq %rcx, %r9
+; SSE2-NEXT:    movq %rcx, %r10
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    movq %rcx, %rsi
+; SSE2-NEXT:    movq %rcx, %rdi
+; SSE2-NEXT:    andb $15, %cl
+; SSE2-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %xmm1, %rcx
+; SSE2-NEXT:    shrq $56, %rdi
+; SSE2-NEXT:    andb $15, %dil
+; SSE2-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %r11
+; SSE2-NEXT:    shrq $48, %rsi
+; SSE2-NEXT:    andb $15, %sil
+; SSE2-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %r14
+; SSE2-NEXT:    shrq $40, %rdx
+; SSE2-NEXT:    andb $15, %dl
+; SSE2-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    andb $15, %al
+; SSE2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    shrq $24, %r10
+; SSE2-NEXT:    andb $15, %r10b
+; SSE2-NEXT:    movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rdi
+; SSE2-NEXT:    shrq $16, %r9
+; SSE2-NEXT:    andb $15, %r9b
+; SSE2-NEXT:    movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rsi
+; SSE2-NEXT:    shrq $8, %r8
+; SSE2-NEXT:    andb $15, %r8b
+; SSE2-NEXT:    movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rbx
+; SSE2-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    andb $15, %cl
+; SSE2-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $56, %rbx
+; SSE2-NEXT:    andb $15, %bl
+; SSE2-NEXT:    movb %bl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $48, %rsi
+; SSE2-NEXT:    andb $15, %sil
+; SSE2-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $40, %rdi
+; SSE2-NEXT:    andb $15, %dil
+; SSE2-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    andb $15, %al
+; SSE2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $24, %rdx
+; SSE2-NEXT:    andb $15, %dl
+; SSE2-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $16, %r14
+; SSE2-NEXT:    andb $15, %r14b
+; SSE2-NEXT:    movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $8, %r11
+; SSE2-NEXT:    andb $15, %r11b
+; SSE2-NEXT:    movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper16xi8b:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pushq %r14
+; SSE42-NEXT:    pushq %rbx
+; SSE42-NEXT:    movq %xmm0, %rcx
+; SSE42-NEXT:    movq %rcx, %r8
+; SSE42-NEXT:    movq %rcx, %r9
+; SSE42-NEXT:    movq %rcx, %r10
+; SSE42-NEXT:    movq %rcx, %rax
+; SSE42-NEXT:    movq %rcx, %rdx
+; SSE42-NEXT:    movq %rcx, %rsi
+; SSE42-NEXT:    movq %rcx, %rdi
+; SSE42-NEXT:    andb $15, %cl
+; SSE42-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE42-NEXT:    shrq $56, %rdi
+; SSE42-NEXT:    andb $15, %dil
+; SSE42-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %r11
+; SSE42-NEXT:    shrq $48, %rsi
+; SSE42-NEXT:    andb $15, %sil
+; SSE42-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %r14
+; SSE42-NEXT:    shrq $40, %rdx
+; SSE42-NEXT:    andb $15, %dl
+; SSE42-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rdx
+; SSE42-NEXT:    shrq $32, %rax
+; SSE42-NEXT:    andb $15, %al
+; SSE42-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rax
+; SSE42-NEXT:    shrq $24, %r10
+; SSE42-NEXT:    andb $15, %r10b
+; SSE42-NEXT:    movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rdi
+; SSE42-NEXT:    shrq $16, %r9
+; SSE42-NEXT:    andb $15, %r9b
+; SSE42-NEXT:    movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rsi
+; SSE42-NEXT:    shrq $8, %r8
+; SSE42-NEXT:    andb $15, %r8b
+; SSE42-NEXT:    movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rbx
+; SSE42-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    andb $15, %cl
+; SSE42-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $56, %rbx
+; SSE42-NEXT:    andb $15, %bl
+; SSE42-NEXT:    movb %bl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $48, %rsi
+; SSE42-NEXT:    andb $15, %sil
+; SSE42-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $40, %rdi
+; SSE42-NEXT:    andb $15, %dil
+; SSE42-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $32, %rax
+; SSE42-NEXT:    andb $15, %al
+; SSE42-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $24, %rdx
+; SSE42-NEXT:    andb $15, %dl
+; SSE42-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $16, %r14
+; SSE42-NEXT:    andb $15, %r14b
+; SSE42-NEXT:    movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $8, %r11
+; SSE42-NEXT:    andb $15, %r11b
+; SSE42-NEXT:    movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE42-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE42-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT:    popq %rbx
+; SSE42-NEXT:    popq %r14
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper16xi8b:
 ; AVX:       # BB#0:
@@ -962,81 +1102,156 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
 }

 define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
-; SSE-LABEL: _clearupper32xi8b:
-; SSE:       # BB#0:
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT:    movq %xmm0, %rcx
-; SSE-NEXT:    movq %rcx, %r8
-; SSE-NEXT:    movq %rcx, %r9
-; SSE-NEXT:    movq %rcx, %r10
-; SSE-NEXT:    movq %rcx, %rax
-; SSE-NEXT:    movq %rcx, %rdx
-; SSE-NEXT:    movq %rcx, %rsi
-; SSE-NEXT:    movq %rcx, %rdi
-; SSE-NEXT:    andb $15, %cl
-; SSE-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %xmm2, %rcx
-; SSE-NEXT:    shrq $56, %rdi
-; SSE-NEXT:    andb $15, %dil
-; SSE-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %r11
-; SSE-NEXT:    shrq $48, %rsi
-; SSE-NEXT:    andb $15, %sil
-; SSE-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %r14
-; SSE-NEXT:    shrq $40, %rdx
-; SSE-NEXT:    andb $15, %dl
-; SSE-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rdx
-; SSE-NEXT:    shrq $32, %rax
-; SSE-NEXT:    andb $15, %al
-; SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rax
-; SSE-NEXT:    shrq $24, %r10
-; SSE-NEXT:    andb $15, %r10b
-; SSE-NEXT:    movb %r10b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rdi
-; SSE-NEXT:    shrq $16, %r9
-; SSE-NEXT:    andb $15, %r9b
-; SSE-NEXT:    movb %r9b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rsi
-; SSE-NEXT:    shrq $8, %r8
-; SSE-NEXT:    andb $15, %r8b
-; SSE-NEXT:    movb %r8b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq %rcx, %rbx
-; SSE-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    andb $15, %cl
-; SSE-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $56, %rbx
-; SSE-NEXT:    andb $15, %bl
-; SSE-NEXT:    movb %bl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $48, %rsi
-; SSE-NEXT:    andb $15, %sil
-; SSE-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $40, %rdi
-; SSE-NEXT:    andb $15, %dil
-; SSE-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $32, %rax
-; SSE-NEXT:    andb $15, %al
-; SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $24, %rdx
-; SSE-NEXT:    andb $15, %dl
-; SSE-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $16, %r14
-; SSE-NEXT:    andb $15, %r14b
-; SSE-NEXT:    movb %r14b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    shrq $8, %r11
-; SSE-NEXT:    andb $15, %r11b
-; SSE-NEXT:    movb %r11b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper32xi8b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %rcx, %r8
+; SSE2-NEXT:    movq %rcx, %r9
+; SSE2-NEXT:    movq %rcx, %r10
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    movq %rcx, %rsi
+; SSE2-NEXT:    movq %rcx, %rdi
+; SSE2-NEXT:    andb $15, %cl
+; SSE2-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %xmm2, %rcx
+; SSE2-NEXT:    shrq $56, %rdi
+; SSE2-NEXT:    andb $15, %dil
+; SSE2-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %r11
+; SSE2-NEXT:    shrq $48, %rsi
+; SSE2-NEXT:    andb $15, %sil
+; SSE2-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %r14
+; SSE2-NEXT:    shrq $40, %rdx
+; SSE2-NEXT:    andb $15, %dl
+; SSE2-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    andb $15, %al
+; SSE2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    shrq $24, %r10
+; SSE2-NEXT:    andb $15, %r10b
+; SSE2-NEXT:    movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rdi
+; SSE2-NEXT:    shrq $16, %r9
+; SSE2-NEXT:    andb $15, %r9b
+; SSE2-NEXT:    movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rsi
+; SSE2-NEXT:    shrq $8, %r8
+; SSE2-NEXT:    andb $15, %r8b
+; SSE2-NEXT:    movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq %rcx, %rbx
+; SSE2-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    andb $15, %cl
+; SSE2-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $56, %rbx
+; SSE2-NEXT:    andb $15, %bl
+; SSE2-NEXT:    movb %bl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $48, %rsi
+; SSE2-NEXT:    andb $15, %sil
+; SSE2-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $40, %rdi
+; SSE2-NEXT:    andb $15, %dil
+; SSE2-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    andb $15, %al
+; SSE2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $24, %rdx
+; SSE2-NEXT:    andb $15, %dl
+; SSE2-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $16, %r14
+; SSE2-NEXT:    andb $15, %r14b
+; SSE2-NEXT:    movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    shrq $8, %r11
+; SSE2-NEXT:    andb $15, %r11b
+; SSE2-NEXT:    movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper32xi8b:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pushq %r14
+; SSE42-NEXT:    pushq %rbx
+; SSE42-NEXT:    movq %xmm0, %rcx
+; SSE42-NEXT:    movq %rcx, %r8
+; SSE42-NEXT:    movq %rcx, %r9
+; SSE42-NEXT:    movq %rcx, %r10
+; SSE42-NEXT:    movq %rcx, %rax
+; SSE42-NEXT:    movq %rcx, %rdx
+; SSE42-NEXT:    movq %rcx, %rsi
+; SSE42-NEXT:    movq %rcx, %rdi
+; SSE42-NEXT:    andb $15, %cl
+; SSE42-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE42-NEXT:    shrq $56, %rdi
+; SSE42-NEXT:    andb $15, %dil
+; SSE42-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %r11
+; SSE42-NEXT:    shrq $48, %rsi
+; SSE42-NEXT:    andb $15, %sil
+; SSE42-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %r14
+; SSE42-NEXT:    shrq $40, %rdx
+; SSE42-NEXT:    andb $15, %dl
+; SSE42-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rdx
+; SSE42-NEXT:    shrq $32, %rax
+; SSE42-NEXT:    andb $15, %al
+; SSE42-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rax
+; SSE42-NEXT:    shrq $24, %r10
+; SSE42-NEXT:    andb $15, %r10b
+; SSE42-NEXT:    movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rdi
+; SSE42-NEXT:    shrq $16, %r9
+; SSE42-NEXT:    andb $15, %r9b
+; SSE42-NEXT:    movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rsi
+; SSE42-NEXT:    shrq $8, %r8
+; SSE42-NEXT:    andb $15, %r8b
+; SSE42-NEXT:    movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq %rcx, %rbx
+; SSE42-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    andb $15, %cl
+; SSE42-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $56, %rbx
+; SSE42-NEXT:    andb $15, %bl
+; SSE42-NEXT:    movb %bl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $48, %rsi
+; SSE42-NEXT:    andb $15, %sil
+; SSE42-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $40, %rdi
+; SSE42-NEXT:    andb $15, %dil
+; SSE42-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $32, %rax
+; SSE42-NEXT:    andb $15, %al
+; SSE42-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $24, %rdx
+; SSE42-NEXT:    andb $15, %dl
+; SSE42-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $16, %r14
+; SSE42-NEXT:    andb $15, %r14b
+; SSE42-NEXT:    movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    shrq $8, %r11
+; SSE42-NEXT:    andb $15, %r11b
+; SSE42-NEXT:    movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE42-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE42-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE42-NEXT:    popq %rbx
+; SSE42-NEXT:    popq %r14
+; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper32xi8b:
 ; AVX1:       # BB#0:
@@ -1417,10 +1632,16 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
 }

 define <2 x i64> @_clearupper2xi64c(<2 x i64>) nounwind {
-; SSE-LABEL: _clearupper2xi64c:
-; SSE:       # BB#0:
-; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper2xi64c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper2xi64c:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper2xi64c:
 ; AVX1:       # BB#0:
@@ -1438,12 +1659,19 @@ define <2 x i64> @_clearupper2xi64c(<2 x i64>) nounwind {
 }

 define <4 x i64> @_clearupper4xi64c(<4 x i64>) nounwind {
-; SSE-LABEL: _clearupper4xi64c:
-; SSE:       # BB#0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper4xi64c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper4xi64c:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper4xi64c:
 ; AVX:       # BB#0:
@@ -1455,10 +1683,16 @@ define <4 x i64> @_clearupper4xi64c(<4 x i64>) nounwind {
 }

 define <4 x i32> @_clearupper4xi32c(<4 x i32>) nounwind {
-; SSE-LABEL: _clearupper4xi32c:
-; SSE:       # BB#0:
-; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper4xi32c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper4xi32c:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm1, %xmm1
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper4xi32c:
 ; AVX:       # BB#0:
@@ -1470,12 +1704,19 @@ define <4 x i32> @_clearupper4xi32c(<4 x i32>) nounwind {
 }

 define <8 x i32> @_clearupper8xi32c(<8 x i32>) nounwind {
-; SSE-LABEL: _clearupper8xi32c:
-; SSE:       # BB#0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm1
-; SSE-NEXT:    retq
+; SSE2-LABEL: _clearupper8xi32c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT:    andps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: _clearupper8xi32c:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper8xi32c:
 ; AVX1:       # BB#0: