
[X86][SSE] Added support for combining target shuffles to (V)PSHUFD/VPERMILPD/VPERMILPS immediate permutes

This patch allows target shuffles to be combined to single-input immediate permute instructions - (V)PSHUFD/VPERMILPD/VPERMILPS - allowing more general pattern matching than we currently do, and improving the likelihood of memory folding compared to existing patterns, which tend to reuse the input in multiple arguments.
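
As a rough sketch of what these immediate permutes encode (illustrative standalone C++, not code from this patch): each of the four 32-bit destination lanes gets two bits of the instruction's 8-bit immediate selecting its source lane, which is why a single-input shuffle needs only one register or memory operand and folds loads more readily.

#include <array>
#include <cstdio>

// Pack a 4-lane shuffle mask into the 8-bit immediate consumed by
// (V)PSHUFD/VPERMILPS: two bits per destination lane select its source lane;
// undef lanes (-1) keep their own index so the input lane is preserved.
static unsigned packShuffleImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (Mask[i] < 0 ? i : Mask[i]) << (2 * i);
  return Imm;
}

int main() {
  // The mask <0,0,1,1> (the "xmm0[0,0,1,1]" pattern in the updated tests
  // below) packs to 0x50.
  std::printf("imm = 0x%02X\n", packShuffleImm({0, 0, 1, 1}));
  return 0;
}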

Further permute instructions - (V)PSHUFLW/(V)PSHUFHW/(V)PERMQ/(V)PERMPD - may be added in the future, but it has proven tricky to create test cases for them so far. (V)PSHUFLW/(V)PSHUFHW are already handled quite well in combineTargetShuffle, so removing some of that code may allow us to perform more of the combining in one place without duplication.

Differential Revision: http://reviews.llvm.org/D21148

llvm-svn: 273999
Simon Pilgrim 2016-06-28 08:08:15 +00:00
parent d07acd7937
commit e24daf0c1e
22 changed files with 218 additions and 117 deletions


@ -7148,8 +7148,7 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
@ -7161,7 +7160,12 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
return DAG.getConstant(Imm, DL, MVT::i8);
return Imm;
}
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
SelectionDAG &DAG) {
return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// \brief Compute whether each element of a shuffle is zeroable.
@ -24529,7 +24533,8 @@ static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT) {
bool FloatDomain = SrcVT.isFloatingPoint();
bool FloatDomain = SrcVT.isFloatingPoint() ||
(!Subtarget.hasAVX2() && SrcVT.is256BitVector());
// Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
if (!FloatDomain && SrcVT.is128BitVector() &&
@ -24607,6 +24612,83 @@ static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
return false;
}
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
// Ensure we don't contain any zero elements.
for (int M : Mask) {
if (M == SM_SentinelZero)
return false;
assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
"Expected unary shuffle");
}
// We only support permutation of 32/64 bit elements.
// TODO - support PSHUFLW/PSHUFHW.
unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
return false;
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
// AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
bool FloatDomain = SrcVT.isFloatingPoint();
if (FloatDomain && !Subtarget.hasAVX())
return false;
// Pre-AVX2 we must use float shuffles on 256-bit vectors.
if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
FloatDomain = true;
// TODO - support LaneCrossing for AVX2 PERMQ/PERMPD
if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask))
return false;
// VPERMILPD can permute with a non-repeating shuffle.
if (FloatDomain && MaskScalarSizeInBits == 64) {
Shuffle = X86ISD::VPERMILPI;
ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
continue;
assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}
// We need a repeating shuffle mask for VPERMILPS/PSHUFD.
SmallVector<int, 4> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
return false;
// Narrow the repeated mask for 32-bit element permutes.
SmallVector<int, 4> WordMask = RepeatedMask;
if (MaskScalarSizeInBits == 64) {
WordMask.clear();
for (int M : RepeatedMask) {
if (M == SM_SentinelUndef) {
WordMask.append(2, SM_SentinelUndef);
continue;
}
WordMask.push_back((M * 2) + 0);
WordMask.push_back((M * 2) + 1);
}
}
Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
PermuteImm = getV4X86ShuffleImm(WordMask);
return true;
}
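
// Illustrative worked examples of the matching above (a sketch, not part of
// this commit), using patterns that appear in this patch's tests:
//  * A v2f64 mask <1,1> is float domain with 64-bit elements, so it takes the
//    VPERMILPD path: PermuteImm |= (M & 1) << i gives 0b11, i.e. the
//    "vpermilpd xmm0 = xmm0[1,1]" seen in the updated tests below.
//  * A v2i64 mask <1,0> stays in the integer domain; the repeated 64-bit mask
//    widens to the word mask <2,3,0,1>, which getV4X86ShuffleImm packs to
//    0x4E, i.e. the "pshufd xmm0 = xmm0[2,3,0,1]" seen below.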
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
@ -24708,7 +24790,7 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
// Attempt to match the mask against known shuffle patterns.
MVT ShuffleVT;
unsigned Shuffle;
unsigned Shuffle, PermuteImm;
if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
@ -24722,6 +24804,20 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
return true;
}
if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, Input);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getConstant(PermuteImm, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
/*AddTo*/ true);
return true;
}
if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!


@ -6,7 +6,7 @@ define void @endless_loop() {
; CHECK-NEXT: vmovaps (%eax), %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; CHECK-NEXT: vxorps %ymm2, %ymm2, %ymm2
; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]


@ -10,9 +10,9 @@ define void @func() nounwind ssp {
; CHECK-NEXT: vmovups 0, %xmm0
; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
; CHECK-NEXT: vpbroadcastd 32, %xmm3
; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
; CHECK-NEXT: vbroadcastss 32, %xmm3
; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; CHECK-NEXT: vmulps %ymm0, %ymm2, %ymm2
; CHECK-NEXT: vmulps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0


@ -2385,14 +2385,14 @@ define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
; X32-LABEL: test_mm256_set1_epi32:
; X32: # BB#0:
; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi32:
; X64: # BB#0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
@ -2422,7 +2422,7 @@ define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X64-LABEL: test_mm256_set1_epi64x:
; X64: # BB#0:
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x i64> undef, i64 %a0, i32 0


@ -4036,7 +4036,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) {
;
; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vpermilpd LCPI227_0, %ymm0, %ymm0
; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1]
ret <4 x double> %res


@ -28,7 +28,7 @@ define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcC:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vmovq %rdi, %xmm0
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:


@ -173,13 +173,13 @@ define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtabl
; X32-LABEL: load_splat_8i32_4i32_33333333:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
; X32-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,3,3,3]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64: ## BB#0: ## %entry
; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,3,3,3]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@ -277,15 +277,13 @@ define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable re
; X32-LABEL: load_splat_4i64_2i64_1111:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %xmm0
; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64: ## BB#0: ## %entry
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:


@ -760,7 +760,7 @@ define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwt
; AVX: # BB#0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;


@ -39,9 +39,9 @@ define <16 x i8> @test3(<16 x i8> %V) {
define <16 x i8> @test4(<16 x i8> %V, <2 x i64>* %P) {
; CHECK-LABEL: test4:
; CHECK: # BB#0:
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1084818905618843912,506097522914230528]
; CHECK-NEXT: movdqa %xmm1, (%rdi)
; CHECK-NEXT: pshufb %xmm1, %xmm0
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1084818905618843912,506097522914230528]
; CHECK-NEXT: movaps %xmm1, (%rdi)
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: retq
%1 = insertelement <2 x i64> undef, i64 1084818905618843912, i32 0
%2 = insertelement <2 x i64> %1, i64 506097522914230528, i32 1


@ -207,7 +207,7 @@ define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64: ## BB#0: ## %entry
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT: retq
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
@ -220,7 +220,7 @@ define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64: ## BB#0: ## %entry
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT: retq
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >


@ -1,4 +1,3 @@
; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
@ -147,7 +146,7 @@ define <4 x i1> @test_cmp_v4f64(<4 x double> %a0, <4 x double> %a1) nounwind {
; AVX2-LABEL: test_cmp_v4f64:
; AVX2: # BB#0:
; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@ -684,10 +683,10 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; AVX2-LABEL: test_cmp_v8f64:
; AVX2: # BB#0:
; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
@ -2143,10 +2142,10 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
; AVX2-LABEL: test_cmp_v16f64:
; AVX2: # BB#0:
; AVX2-NEXT: vcmpltpd %ymm2, %ymm6, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3]
; AVX2-NEXT: vcmpltpd %ymm3, %ymm7, %ymm3
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
@ -2155,10 +2154,10 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vcmpltpd %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vcmpltpd %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0


@ -925,7 +925,7 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:


@ -159,7 +159,7 @@ define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
;
; AVX-LABEL: shuffle_v2f64_11:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
ret <2 x double> %shuffle
@ -217,7 +217,7 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
;
; AVX-LABEL: shuffle_v2f64_33:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1,1]
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
ret <2 x double> %shuffle


@ -227,7 +227,7 @@ define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
;
; AVX-LABEL: shuffle_v4f32_0011:
; AVX: # BB#0:
; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
ret <4 x float> %shuffle
@ -240,7 +240,7 @@ define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
;
; AVX-LABEL: shuffle_v4f32_2233:
; AVX: # BB#0:
; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
ret <4 x float> %shuffle
@ -1952,30 +1952,16 @@ define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
}
define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
; SSE3: # BB#0:
; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
; SSE41: # BB#0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE41-NEXT: retq
; SSE-LABEL: broadcast_v4f32_0101_from_v2f32:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
; AVX: # BB#0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX-NEXT: retq
%1 = load <2 x float>, <2 x float>* %x, align 1
%2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>


@ -1443,7 +1443,7 @@ define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_z
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:


@ -488,7 +488,7 @@ define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_11uu:
; ALL: # BB#0:
; ALL-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
ret <4 x double> %shuffle
@ -557,7 +557,7 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0001:
; AVX1: # BB#0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@ -579,7 +579,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@ -641,8 +641,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1000:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@ -800,7 +800,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: retq
@ -1320,7 +1320,7 @@ define <4 x double> @splat_v4f64(<2 x double> %r) {
define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; AVX1-LABEL: splat_mem_v4i64_from_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@ -1397,7 +1397,7 @@ define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;


@ -152,8 +152,8 @@ define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00112233:
; AVX1: # BB#0:
; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@ -196,7 +196,7 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_08080808:
; AVX1: # BB#0:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@ -213,7 +213,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_08084c4c:
; ALL: # BB#0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x float> %shuffle
@ -907,8 +907,8 @@ define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000010:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@ -924,8 +924,8 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000200:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@ -941,8 +941,8 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00003000:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@ -1042,8 +1042,8 @@ define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00112233:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@ -1059,8 +1059,8 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00001111:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@ -1091,7 +1091,7 @@ define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08080808:
; AVX1: # BB#0:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@ -1108,7 +1108,7 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08084c4c:
; AVX1: # BB#0:
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_08084c4c:
@ -1239,8 +1239,8 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_091b2d3f:
; AVX1: # BB#0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
@ -1257,7 +1257,7 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_09ab1def:
; AVX1: # BB#0:
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
@ -2050,7 +2050,7 @@ define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_z0U2zUz6:
@ -2066,7 +2066,7 @@ define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_1U3z5zUU:


@ -67,7 +67,7 @@ define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_unpckh:
; ALL: # BB#0:
; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; ALL-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
ret <4 x float> %1
@ -76,7 +76,7 @@ define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_unpckl:
; ALL: # BB#0:
; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; ALL-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
ret <4 x float> %1
@ -167,7 +167,7 @@ define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
; ALL-LABEL: combine_vpermilvar_4f32_4stage:
; ALL: # BB#0:
; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,0,1,2,3,12,13,14,15,4,5,6,7]
; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; ALL-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
%2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>)
@ -177,24 +177,10 @@ define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
}
define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
; AVX1-LABEL: combine_vpermilvar_8f32_4stage:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vpermilvar_8f32_4stage:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,0,1,2,3,12,13,14,15,4,5,6,7,24,25,26,27,16,17,18,19,28,29,30,31,20,21,22,23]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vpermilvar_8f32_4stage:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,0,1,2,3,12,13,14,15,4,5,6,7,24,25,26,27,16,17,18,19,28,29,30,31,20,21,22,23]
; AVX512F-NEXT: retq
; ALL-LABEL: combine_vpermilvar_8f32_4stage:
; ALL: # BB#0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; ALL-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
%2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>)
%3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 0, i32 2, i32 1, i32 3>)


@ -51,7 +51,7 @@ define <4 x i64> @combine_permq_pshufb(<4 x i64> %a0) {
; CHECK-LABEL: combine_permq_pshufb:
; CHECK: # BB#0:
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31,16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%2 = bitcast <4 x i64> %1 to <32 x i8>


@ -212,6 +212,43 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *
ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps:
; CHECK: # BB#0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_load:
; CHECK: # BB#0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; CHECK: # BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; CHECK: # BB#0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0
}
define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermt2var_16i32_identity:
; CHECK: # BB#0:


@ -94,13 +94,12 @@ define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pshufb_palignr:
; SSE: # BB#0:
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_palignr:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
%2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)


@ -2440,7 +2440,7 @@ define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
;
; AVX-LABEL: combine_undef_input_test9:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
%2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
@ -2631,7 +2631,7 @@ define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
;
; AVX-LABEL: combine_undef_input_test19:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
%2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>