
X86 Tests: Update more isel tests with FastVariableShuffle feature

Summary:
Added the FastVariableShuffle feature to cases that resemble processors
for which this feature is on.
For AVX2 there are processors both with and without this feature enabled.
For AVX512, only KNL enables this feature, so cases which have only
+avx512f were left without FastVariableShuffle enabled.
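
A typical update duplicates a RUN line with +fast-variable-shuffle added and
splits the shared check prefix into -SLOW and -FAST variants, for example
(taken from the AVX2 tests below; the exact prefixes vary per file):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST

In the -FAST checks, chains of fixed shuffles such as vpshufd + vpermq are
generally replaced by a single variable shuffle (vpermd/vpermps or vpshufb)
driven by a constant shuffle-control vector.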

Reviewers: RKSimon, craig.topper

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D41851

llvm-svn: 322090
Zvi Rackover 2018-01-09 16:26:06 +00:00
parent 1d90025fc4
commit 3a6b18f0a0
29 changed files with 2674 additions and 1597 deletions


@ -1,23 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST
define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
; X32-LABEL: trunc4:
; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
; X32-SLOW-LABEL: trunc4:
; X32-SLOW: # %bb.0:
; X32-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; X32-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-SLOW-NEXT: vzeroupper
; X32-SLOW-NEXT: retl
;
; X64-LABEL: trunc4:
; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
; X32-FAST-LABEL: trunc4:
; X32-FAST: # %bb.0:
; X32-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; X32-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X32-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-FAST-NEXT: vzeroupper
; X32-FAST-NEXT: retl
;
; X64-SLOW-LABEL: trunc4:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; X64-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-SLOW-NEXT: vzeroupper
; X64-SLOW-NEXT: retq
;
; X64-FAST-LABEL: trunc4:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; X64-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-FAST-NEXT: vzeroupper
; X64-FAST-NEXT: retq
%B = trunc <4 x i64> %A to <4 x i32>
ret <4 x i32>%B
}


@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST
; AVX2 Logical Shift Left
@ -372,25 +374,45 @@ entry:
}
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-LABEL: srl_trunc_and_v4i64:
; X32: # %bb.0:
; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X32-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-NEXT: vpand %xmm2, %xmm1, %xmm1
; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
; X32-SLOW-LABEL: srl_trunc_and_v4i64:
; X32-SLOW: # %bb.0:
; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X32-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X32-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
; X32-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-SLOW-NEXT: vzeroupper
; X32-SLOW-NEXT: retl
;
; X64-LABEL: srl_trunc_and_v4i64:
; X64: # %bb.0:
; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-NEXT: vpand %xmm2, %xmm1, %xmm1
; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
; X32-FAST-LABEL: srl_trunc_and_v4i64:
; X32-FAST: # %bb.0:
; X32-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X32-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X32-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1
; X32-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-FAST-NEXT: vzeroupper
; X32-FAST-NEXT: retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW: # %bb.0:
; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X64-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X64-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT: vzeroupper
; X64-SLOW-NEXT: retq
;
; X64-FAST-LABEL: srl_trunc_and_v4i64:
; X64-FAST: # %bb.0:
; X64-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X64-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1
; X64-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-NEXT: vzeroupper
; X64-FAST-NEXT: retq
%and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
%trunc = trunc <4 x i64> %and to <4 x i32>
%sra = lshr <4 x i32> %x, %trunc


@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -O2 | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
@ -331,8 +331,8 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $24, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
@ -345,8 +345,8 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@ -541,8 +541,8 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $56, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
@ -555,8 +555,8 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl
; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@ -1134,8 +1134,8 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $24, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
@ -1147,8 +1147,8 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@ -1369,8 +1369,8 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $56, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
@ -1382,8 +1382,8 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)


@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq %s -o - | FileCheck %s
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq,+fast-variable-shuffle %s -o - | FileCheck %s
; FIXME: fixing PR34394 should fix the i32x2 memory cases resulting in a simple vbroadcasti32x2 instruction.
@ -459,8 +459,8 @@ define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i3
define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
; CHECK-LABEL: test_2xi32_to_8xi32_mem:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@ -470,8 +470,8 @@ define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
@ -486,8 +486,8 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
@ -501,8 +501,8 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i3
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
@ -517,8 +517,8 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
@ -532,8 +532,8 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i3
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
@ -548,8 +548,8 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
@ -563,8 +563,8 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i3
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
@ -579,8 +579,8 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32>
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]

File diff suppressed because it is too large.


@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX
attributes #0 = { nounwind }


@ -2,8 +2,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512
;
; 128-bit vectors
@ -379,19 +380,31 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: ext_i32_32i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovd %edi, %xmm0
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-SLOW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: ext_i32_32i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovd %edi, %xmm0
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: ext_i32_32i8:
; AVX512: # %bb.0:
@ -683,26 +696,43 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i64_64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm0
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: ext_i64_64i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovq %rdi, %xmm0
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: ext_i64_64i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovq %rdi, %xmm0
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: ext_i64_64i8:
; AVX512: # %bb.0:


@ -2,9 +2,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
;
; 128-bit vectors
@ -472,21 +473,35 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: ext_i32_32i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovd %edi, %xmm0
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-SLOW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: ext_i32_32i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovd %edi, %xmm0
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: ext_i32_32i8:
; AVX512F: # %bb.0:
@ -885,31 +900,53 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i64_64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm0
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: ext_i64_64i8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovq %rdi, %xmm0
; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: ext_i64_64i8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovq %rdi, %xmm0
; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-FAST-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: ext_i64_64i8:
; AVX512F: # %bb.0:


@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=AVX512VLCDBW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=X86-AVX512VLCDBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,X86-AVX512VLCDBW
define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) {
; AVX512CD-LABEL: test_mm_epi64:
@ -28,7 +28,7 @@ define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) {
; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; X86-AVX512VLCDBW-NEXT: retl
entry:
%0 = icmp eq <8 x i16> %a, %b
@ -122,7 +122,7 @@ define <8 x i64> @test_mm512_epi64(<8 x i32> %a, <8 x i32> %b) {
; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512VLCDBW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512VLCDBW-NEXT: retl
@ -160,7 +160,7 @@ define <4 x i64> @test_mm256_epi64(<8 x i32> %a, <8 x i32> %b) {
; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512VLCDBW-NEXT: retl
entry:


@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX-FAST
; fold (shl 0, x) -> 0
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
@ -113,14 +114,23 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-NEXT: pmulld %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_trunc_and:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-FAST-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
%1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = shl <4 x i32> %x, %2


@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; fold (sra 0, x) -> 0
define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
@ -180,14 +181,23 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_trunc_and:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: combine_vec_ashr_trunc_and:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-FAST-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
%1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = ashr <4 x i32> %x, %2
@ -213,14 +223,23 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_trunc_lshr:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
%1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
@ -247,13 +266,21 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_trunc_ashr:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
%1 = ashr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>


@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; fold (srl 0, x) -> 0
define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
@ -215,14 +216,23 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
%1 = lshr <4 x i64> %x, <i64 32, i64 33, i64 34, i64 35>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = lshr <4 x i32> %2, <i32 16, i32 17, i32 18, i32 19>
@ -446,14 +456,23 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_and:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: combine_vec_lshr_trunc_and:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
%1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = lshr <4 x i32> %x, %2


@ -4,7 +4,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
; SSE2-LABEL: insert_v2f64_z1:
@ -429,12 +430,24 @@ define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) {
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v16i8_z123456789ABCDEz:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
; AVX1-LABEL: insert_v16i8_z123456789ABCDEz:
; AVX1: # %bb.0:
; AVX1-NEXT: xorl %eax, %eax
; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: insert_v16i8_z123456789ABCDEz:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: xorl %eax, %eax
; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: insert_v16i8_z123456789ABCDEz:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: retq
%1 = insertelement <16 x i8> %a, i8 0, i32 0
%2 = insertelement <16 x i8> %1, i8 0, i32 15
ret <16 x i8> %2
@ -479,16 +492,25 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
; AVX2: # %bb.0:
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: xorl %eax, %eax
; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
%1 = insertelement <32 x i8> %a, i8 0, i32 0
%2 = insertelement <32 x i8> %1, i8 0, i32 15
%3 = insertelement <32 x i8> %2, i8 0, i32 30


@ -2,7 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefix=XOP
define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind {
@ -154,16 +155,36 @@ define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
; SSE42-NEXT: movq %xmm2, (%rdi)
; SSE42-NEXT: retq
;
; AVX-LABEL: v5i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm1, (%rdi)
; AVX-NEXT: retq
; AVX1-LABEL: v5i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; AVX1-NEXT: vmovq %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: v5i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-SLOW-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; AVX2-SLOW-NEXT: vmovq %xmm1, (%rdi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: v5i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,4,5,12,13,14,15,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FAST-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; AVX2-FAST-NEXT: vmovq %xmm1, (%rdi)
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v5i16:
; XOP: # %bb.0:
@ -550,18 +571,30 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: v12i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: v12i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-SLOW-NEXT: vmovq %xmm2, 16(%rdi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: v12i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm2
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovq %xmm0, 16(%rdi)
; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v12i16:
; XOP: # %bb.0:
@ -637,20 +670,35 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v12i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vmovaps %ymm0, (%rdi)
; AVX2-NEXT: vmovaps %xmm2, 32(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: v12i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0
; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rdi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: v12i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm2
; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [3,3,7,7,7,7,6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0
; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX2-FAST-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v12i32:
; XOP: # %bb.0:
@ -1400,34 +1448,63 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i32_out:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: vmovups 64(%rdi), %ymm2
; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovups %ymm3, (%rsi)
; AVX2-NEXT: vmovups %ymm4, (%rdx)
; AVX2-NEXT: vmovups %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: interleave_24i32_out:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsi)
; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdx)
; AVX2-SLOW-NEXT: vmovups %ymm0, (%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: interleave_24i32_out:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovups (%rdi), %ymm0
; AVX2-FAST-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups 64(%rdi), %ymm2
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [0,1,0,3,0,1,4,7]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vmovups %ymm3, (%rsi)
; AVX2-FAST-NEXT: vmovups %ymm4, (%rdx)
; AVX2-FAST-NEXT: vmovups %ymm0, (%rcx)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: interleave_24i32_out:
; XOP: # %bb.0:
@ -1603,33 +1680,61 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i32_in:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rsi), %ymm0
; AVX2-NEXT: vmovups (%rdx), %ymm1
; AVX2-NEXT: vmovups (%rcx), %ymm2
; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-NEXT: vbroadcastsd %xmm2, %ymm4
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-NEXT: vmovups %ymm4, 64(%rdi)
; AVX2-NEXT: vmovups %ymm3, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: interleave_24i32_in:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0
; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1
; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-SLOW-NEXT: vmovups %ymm4, 64(%rdi)
; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: interleave_24i32_in:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0
; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1
; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2
; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm2
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX2-FAST-NEXT: vmovups %ymm1, 64(%rdi)
; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi)
; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: interleave_24i32_in:
; XOP: # %bb.0:


@ -2,9 +2,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
define <8 x i16> @test1(<8 x i16> %x) nounwind {
; SSE-LABEL: test1:
@ -1872,32 +1873,58 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: psubus_8i64_max:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm6
; AVX2-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: psubus_8i64_max:
; AVX2-SLOW: # %bb.0: # %vector.ph
; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm2, %ymm5
; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm6
; AVX2-SLOW-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm1, %ymm6
; AVX2-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm4
; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm3, %ymm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: psubus_8i64_max:
; AVX2-FAST: # %bb.0: # %vector.ph
; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-FAST-NEXT: vpxor %ymm4, %ymm2, %ymm5
; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm6
; AVX2-FAST-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
; AVX2-FAST-NEXT: vpxor %ymm4, %ymm1, %ymm6
; AVX2-FAST-NEXT: vpor %ymm4, %ymm3, %ymm4
; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
; AVX2-FAST-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm3, %ymm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: psubus_8i64_max:
; AVX512: # %bb.0: # %vector.ph


@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST
; PR32449
define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind {
@ -27,12 +28,19 @@ define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind {
}
define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind {
; AVX2-LABEL: foo8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-NEXT: vmovaps %ymm0, (%rdi)
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: foo8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: foo8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vbroadcastss {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdi)
; AVX2-FAST-NEXT: retq
%res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  %res1 = shufflevector <8 x float> %res, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 undef, i32 undef, i32 5, i32 1, i32 3, i32 7>
store <8 x float> %res, <8 x float>* %p


@ -5,9 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
@ -400,8 +400,8 @@ define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
@ -460,8 +460,8 @@ define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
@ -520,8 +520,8 @@ define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;


@ -3,9 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
@ -405,10 +405,9 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@ -418,10 +417,9 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
@ -431,10 +429,9 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@ -510,10 +507,9 @@ define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
@ -589,10 +585,9 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@ -602,10 +597,9 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
@ -615,10 +609,9 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@ -748,10 +741,9 @@ define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@ -773,10 +765,9 @@ define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@ -1056,10 +1047,9 @@ define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@ -1081,10 +1071,9 @@ define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper


@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
@ -23,9 +23,9 @@ define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@ -47,9 +47,9 @@ define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@ -77,9 +77,9 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@ -111,15 +111,45 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind
}
define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: vmovaps %ymm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %zmm0
; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
store <8 x i32> %strided.vec, <8 x i32>* %S
@ -399,16 +429,14 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@ -420,16 +448,14 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
@ -478,16 +504,14 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@ -499,16 +523,14 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
@ -557,16 +579,14 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@ -578,16 +598,14 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)


@ -5,9 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
@ -473,8 +473,8 @@ define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
@ -533,8 +533,8 @@ define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;


@ -3,9 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
@ -505,10 +505,9 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper


@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
@ -27,9 +27,9 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@ -51,9 +51,9 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@ -120,13 +120,14 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovaps %ymm0, (%rsi)
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@ -134,10 +135,9 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
@ -174,15 +174,45 @@ define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
}
define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: vmovaps %ymm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %zmm0
; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i32> %strided.vec, <8 x i32>* %S
@ -326,16 +356,14 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@ -347,16 +375,14 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)


@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL
;
; Half to Float
@ -1189,22 +1190,38 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_2i16_to_2f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: movswl %ax, %ecx
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-NEXT: vmovd %ecx, %xmm1
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: cvt_2i16_to_2f64:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
; AVX2-SLOW-NEXT: movswl %ax, %ecx
; AVX2-SLOW-NEXT: shrl $16, %eax
; AVX2-SLOW-NEXT: cwtl
; AVX2-SLOW-NEXT: vmovd %eax, %xmm0
; AVX2-SLOW-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovd %ecx, %xmm1
; AVX2-SLOW-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-SLOW-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: cvt_2i16_to_2f64:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: movswl %ax, %ecx
; AVX2-FAST-NEXT: shrl $16, %eax
; AVX2-FAST-NEXT: cwtl
; AVX2-FAST-NEXT: vmovd %eax, %xmm0
; AVX2-FAST-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovd %ecx, %xmm1
; AVX2-FAST-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-FAST-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-FAST-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: cvt_2i16_to_2f64:
; AVX512F: # %bb.0:

View File

@@ -4,8 +4,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0001:
@@ -1238,18 +1239,28 @@ define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v4i32_z4zz:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX1OR2-NEXT: retq
; AVX1-LABEL: shuffle_v4i32_z4zz:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_z4zz:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
ret <4 x i32> %shuffle
@@ -1284,18 +1295,28 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v4i32_zz4z:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX1OR2-NEXT: retq
; AVX1-LABEL: shuffle_v4i32_zz4z:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_zz4z:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
ret <4 x i32> %shuffle
@@ -1351,12 +1372,22 @@ define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_z6zz:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2OR512VL-NEXT: retq
; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v4i32_z6zz:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_z6zz:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
ret <4 x i32> %shuffle
}

View File

@@ -3,7 +3,8 @@
; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
;
; Verify that the DAG combiner correctly folds bitwise operations across
; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
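The combine described in the comment above can be pictured with a minimal, hypothetical function (the name and mask below are assumptions, not taken from this test file): a bitwise op whose operands are two shuffles using the same mask can typically be rewritten as a single shuffle of the op applied to the original vectors.

; Illustrative sketch only, not part of the test file.
define <4 x i32> @sketch_combine_and_of_shuffles(<4 x i32> %a, <4 x i32> %b) {
  %sa = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %sb = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ; Both operands use the same mask, so the combiner can form the AND of %a and %b
  ; and keep just one shuffle of that result.
  %r = and <4 x i32> %sa, %sb
  ret <4 x i32> %r
}

The checks in this file then pin down which shuffle instructions the backend picks for the surviving shuffle, which is exactly where the fast-variable-shuffle tuning changes the preferred lowering (for example, a vpermd with a constant index vector instead of a vpshufd/vpermq pair).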
@@ -2536,12 +2537,19 @@ define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_unneeded_subvector1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: combine_unneeded_subvector1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
%b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %c
@@ -2873,12 +2881,19 @@ define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR22412:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: PR22412:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: PR22412:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
entry:
%s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>

View File

@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
@@ -309,10 +309,10 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
@@ -324,10 +324,10 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3]
; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper

File diff suppressed because it is too large

View File

@@ -3,11 +3,12 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32:
@@ -26,14 +27,22 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32:
; AVX512: # %bb.0: # %entry
@@ -103,14 +112,22 @@ define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i32_ashr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512: # %bb.0: # %entry
@@ -148,16 +165,26 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i32_lshr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
; AVX512: # %bb.0: # %entry
@@ -228,18 +255,30 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: trunc8i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc8i64_8i16:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i16:
; AVX512: # %bb.0: # %entry
@@ -283,19 +322,32 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: trunc8i64_8i8:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc8i64_8i8:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT: vmovq %xmm0, (%rax)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512: # %bb.0: # %entry
@@ -1368,14 +1420,22 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc2x4i64_8i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc2x4i64_8i32:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i32:
; AVX512F: # %bb.0: # %entry
@@ -1474,18 +1534,30 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc2x4i64_8i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc2x4i64_8i16:
; AVX2-FAST: # %bb.0: # %entry
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
; AVX512F: # %bb.0: # %entry
@@ -1882,14 +1954,22 @@ define <8 x i16> @PR32160(<8 x i32> %x) {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR32160:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-SLOW-LABEL: PR32160:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: PR32160:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: PR32160:
; AVX512F: # %bb.0:
@@ -1903,8 +1983,7 @@ define <8 x i16> @PR32160(<8 x i32> %x) {
; AVX512VL-LABEL: PR32160:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX512VL-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -1912,16 +1991,14 @@ define <8 x i16> @PR32160(<8 x i32> %x) {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR32160:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%shuf = trunc <8 x i32> %x to <8 x i16>

View File

@@ -6,7 +6,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i16:
@@ -1929,11 +1929,16 @@ define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: shuf_zext_8i16_to_4i32_offset1:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: retq
; AVX512F-LABEL: shuf_zext_8i16_to_4i32_offset1:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuf_zext_8i16_to_4i32_offset1:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
; AVX512BW-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8>
%Z = bitcast <8 x i16> %B to <4 x i32>