[X86][AVX] Prefer vinsertf128 to vperm2f128 on AVX1 targets
Splatting the lower xmm with vinsertf128 is at least as quick as vperm2f128, and a lot faster on some AMD targets. First step towards PR50053
commit 96fce2fe63
parent f76b3abd62
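As an illustration only (not part of the patch), here is a minimal C++ intrinsics sketch of two equivalent ways to splat the lower 128-bit lane of a 256-bit register; the function names are hypothetical. Compilers typically emit vperm2f128 for the first form and vinsertf128 for the second, which is the form this patch now prefers on AVX1 targets:

// Illustration only; not taken from the patch.
#include <immintrin.h>

// vperm2f128 $0x00: both result lanes select the low 128 bits of 'v'.
__m256 splat_low_perm2f128(__m256 v) {
  return _mm256_permute2f128_ps(v, v, 0x00);
}

// vinsertf128 $1: re-insert the low xmm half of 'v' into the upper lane.
__m256 splat_low_insertf128(__m256 v) {
  return _mm256_insertf128_ps(v, _mm256_castps256_ps128(v), 1);
}

The same preference shows up in the updated test expectations below, where vperm2f128 splats of the low lane become vinsertf128 or vbroadcastf128.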
@@ -35906,6 +35906,17 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                    DL, 256);
    }

    // If we're splatting the low subvector, an insert-subvector 'concat'
    // pattern is quicker than VPERM2X128.
    // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
    if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
      if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
        return SDValue(); // Nothing to do!
      Res = CanonicalizeShuffleInput(RootVT, V1);
      Res = extractSubVector(Res, 0, DAG, DL, 128);
      return concatSubVectors(Res, Res, DAG, DL);
    }

    if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
      return SDValue(); // Nothing to do!
@@ -39039,6 +39050,22 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
        }
      }
      break;
    }
    case X86ISD::VPERM2X128: {
      // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
      SDLoc DL(Op);
      unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
      if (LoMask & 0x8)
        return TLO.CombineTo(
            Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
      unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
      unsigned SrcIdx = (LoMask & 0x2) >> 1;
      SDValue ExtOp =
          extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
      SDValue UndefVec = TLO.DAG.getUNDEF(VT);
      SDValue Insert =
          insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
      return TLO.CombineTo(Op, Insert);
    }
    // Zero upper elements.
    case X86ISD::VZEXT_MOVL:
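For reference, the low nibble of the VPERM2X128 immediate inspected by the new case encodes what ends up in the lower 128 bits of the result: bit 3 zeroes the lane, bit 1 picks the source operand, and bit 0 picks which half of that source. A hypothetical standalone helper (not part of the patch) mirroring the LoMask/SrcIdx/EltIdx logic above:

// Hypothetical helper, for illustration only.
#include <cstdint>

struct LowerLaneSelect {
  bool Zero;        // imm bit 3 set: lower 128 bits of the result are zero
  unsigned SrcIdx;  // imm bit 1: 0 = first source operand, 1 = second
  unsigned HalfIdx; // imm bit 0: 0 = low half of that source, 1 = high half
};

LowerLaneSelect decodeVPerm2X128LowerLane(uint8_t Imm) {
  unsigned LoMask = Imm & 0xF;
  LowerLaneSelect Sel;
  Sel.Zero = (LoMask & 0x8) != 0;
  Sel.SrcIdx = (LoMask & 0x2) >> 1;
  Sel.HalfIdx = LoMask & 0x1;
  return Sel;
}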
@@ -50438,6 +50465,19 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                     MemIntr->getMemoryVT(), MemIntr, 0, DAG);
  }

  // If we're splatting the lower half subvector of a full vector load into the
  // upper half, attempt to create a subvector broadcast.
  if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
      Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
    auto *VecLd = dyn_cast<LoadSDNode>(Vec);
    auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
    if (VecLd && SubLd &&
        DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
                                           SubVec.getValueSizeInBits() / 8, 0))
      return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
                               SubLd, 0, DAG);
  }

  return SDValue();
}
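One way the pattern targeted by the combineInsertSubvector change can arise in source code (an assumed example, not taken from the commit): a full 256-bit load whose low 128-bit half is loaded again and inserted into the upper lane. The sub-load reads the same address as the full load, so the two are consecutive loads at distance 0, and with this change the backend can fold the sequence into a single subvector broadcast load (vbroadcastf128):

// Assumed illustration of the pattern; not taken from the commit.
#include <immintrin.h>

__m256d splat_low_half_of_load(const double *p) {
  __m256d full = _mm256_loadu_pd(p); // 256-bit load of p[0..3]
  __m128d low = _mm_loadu_pd(p);     // 128-bit load of p[0..1]
  // Result = { p[0], p[1], p[0], p[1] }.
  return _mm256_insertf128_pd(full, low, 1);
}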
@@ -1160,23 +1160,23 @@ define void @indices_convert() {
;
; XOP-LABEL: indices_convert:
; XOP:       # %bb.0: # %bb
; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; XOP-NEXT:    vmovapd (%rax), %xmm1
; XOP-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vpermil2pd $0, %xmm0, %xmm1, %xmm1, %xmm0
; XOP-NEXT:    vmovdqa (%rax), %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpermil2pd $0, %xmm1, %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovupd %xmm0, (%rax)
; XOP-NEXT:    retq
;
; AVX1-LABEL: indices_convert:
; AVX1:       # %bb.0: # %bb
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vmovapd (%rax), %xmm1
; AVX1-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpermilpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqa (%rax), %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovupd %xmm0, (%rax)
; AVX1-NEXT:    retq
;
@@ -294,11 +294,10 @@ define void @splat2_i64(<4 x i64>* %s, <8 x i64>* %d) {
;
; AVX1-LABEL: splat2_i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovupd (%rdi), %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT:    vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT:    vmovupd %ymm0, 32(%rsi)
; AVX1-NEXT:    vmovupd %ymm1, (%rsi)
; AVX1-NEXT:    vzeroupper
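For context, an assumed scalar reference (not taken from the test files) for what the splat2_* tests here and further below compute: every element of the 4-element input is stored twice, so the updated AVX1 code builds the result from a low-lane splat of the loaded vector (now vbroadcastf128) and a high-lane splat, each followed by an in-lane vpermilpd:

// Assumed scalar reference for the splat2_* tests; illustration only.
void splat2_v4f64_ref(const double *s, double *d) {
  for (int i = 0; i < 4; ++i) {
    d[2 * i + 0] = s[i];
    d[2 * i + 1] = s[i];
  }
}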
@@ -166,10 +166,10 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00040000:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[2,0],ymm0[4,4],ymm1[6,4]
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00040000:
@@ -835,9 +835,9 @@ define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_c348cda0:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[0,1,0,1]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
@@ -1667,10 +1667,10 @@ define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00040000:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[2,0],ymm0[4,4],ymm1[6,4]
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00040000:
@@ -1646,11 +1646,10 @@ ret void
define void @splat2_v4f64_load_store(<4 x double>* %s, <8 x double>* %d) {
; AVX1-LABEL: splat2_v4f64_load_store:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovupd (%rdi), %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT:    vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT:    vmovupd %ymm0, 32(%rsi)
; AVX1-NEXT:    vmovupd %ymm1, (%rsi)
; AVX1-NEXT:    vzeroupper
@@ -1684,11 +1683,10 @@ define void @splat2_v4f64_load_store(<4 x double>* %s, <8 x double>* %d) {
define void @splat2_v4i64_load_store(<4 x i64>* %s, <8 x i64>* %d) {
; AVX1-LABEL: splat2_v4i64_load_store:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovupd (%rdi), %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT:    vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT:    vmovupd %ymm0, 32(%rsi)
; AVX1-NEXT:    vmovupd %ymm1, (%rsi)
; AVX1-NEXT:    vzeroupper