[PowerPC] Exploit single instruction load-and-splat for word and doubleword

We currently produce a load, followed by a splat (and, for integers, possibly a
move) as separate instructions. VSX has always had a splatting load for
doublewords, but as of Power9 we have one for words as well. This patch just
exploits these instructions.

Differential revision: https://reviews.llvm.org/D63624

llvm-svn: 372139
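To make the pattern concrete, here is an illustrative source-level example (not part of the patch, and using only the standard altivec.h vector extension): with this change, the doubleword splat below can lower to a single addi + lxvdsx instead of a scalar load, a direct move, and a splat, and the word form can lower to lxvwsx on Power9.

    #include <altivec.h>

    // Splat a doubleword loaded from memory; with this patch it is expected
    // to become addi + lxvdsx on any VSX target.
    vector double splat_doubleword(const double *p) {
      return vec_splats(p[3]);
    }

    // Splat a word loaded from memory; expected to become addi + lxvwsx on
    // Power9.
    vector int splat_word(const int *p) {
      return vec_splats(p[3]);
    }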
commit 416a79594b (parent d100299e59)
@@ -1406,6 +1406,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::EXTSWSLI:        return "PPCISD::EXTSWSLI";
   case PPCISD::LD_VSX_LH:       return "PPCISD::LD_VSX_LH";
   case PPCISD::FP_EXTEND_HALF:  return "PPCISD::FP_EXTEND_HALF";
+  case PPCISD::LD_SPLAT:        return "PPCISD::LD_SPLAT";
   }
   return nullptr;
 }
@@ -1778,10 +1779,10 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
 
 /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a splat of a single element that is suitable for input to
-/// VSPLTB/VSPLTH/VSPLTW.
+/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
 bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
-  assert(N->getValueType(0) == MVT::v16i8 &&
-         (EltSize == 1 || EltSize == 2 || EltSize == 4));
+  assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) &&
+         EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
 
   // The consecutive indices need to specify an element, not part of two
   // different elements. So abandon ship early if this isn't the case.
@@ -2074,10 +2075,11 @@ bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
 }
 
 
-/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
-/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
-unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
-                                SelectionDAG &DAG) {
+/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
+/// appropriate for PPC mnemonics (which have a big endian bias - namely
+/// elements are counted from the left of the vector register).
+unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
+                                         SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   assert(isSplatShuffleMask(SVOp, EltSize));
   if (DAG.getDataLayout().isLittleEndian())
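For intuition about the rename above: PPC splat mnemonics count elements from the left of the vector register (a big-endian bias), so on little-endian targets the index taken from the shuffle mask has to be mirrored. A minimal standalone sketch of that computation (an illustration, not the patch's code; MaskElt stands in for SVOp->getMaskElt(0)):

    #include <cassert>

    // For a 16-byte VSX register split into EltSize-byte elements, convert a
    // shuffle-mask byte index into the element index a PPC mnemonic expects.
    unsigned splatIdxForPPCMnemonics(unsigned MaskElt, unsigned EltSize,
                                     bool IsLittleEndian) {
      unsigned NumElts = 16 / EltSize;   // elements per 128-bit register
      unsigned Idx = MaskElt / EltSize;  // element the splat reads
      return IsLittleEndian ? NumElts - 1 - Idx : Idx;
    }

    int main() {
      // Element 0 of a v4i32 is word 3 in left-counted (mnemonic) terms on LE.
      assert(splatIdxForPPCMnemonics(0, 4, /*IsLittleEndian=*/true) == 3);
      assert(splatIdxForPPCMnemonics(0, 4, /*IsLittleEndian=*/false) == 0);
      return 0;
    }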
@@ -8185,6 +8187,18 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
                      Op0.getOperand(1));
 }
 
+const SDValue *getNormalLoadInput(const SDValue &Op) {
+  const SDValue *InputLoad = &Op;
+  if (InputLoad->getOpcode() == ISD::BITCAST)
+    InputLoad = &InputLoad->getOperand(0);
+  if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR)
+    InputLoad = &InputLoad->getOperand(0);
+  if (InputLoad->getOpcode() != ISD::LOAD)
+    return nullptr;
+  LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+  return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.  If we CAN select this case, and if it
 // selects to a single instruction, return Op.  Otherwise, if we can codegen
@@ -8307,6 +8321,34 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                              HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
       SplatBitSize > 32) {
+
+    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
+    // Handle load-and-splat patterns as we have instructions that will do this
+    // in one go.
+    if (InputLoad && DAG.isSplatValue(Op, true)) {
+      LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+
+      // We have handling for 4 and 8 byte elements.
+      unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
+
+      // Checking for a single use of this load, we have to check for vector
+      // width (128 bits) / ElementSize uses (since each operand of the
+      // BUILD_VECTOR is a separate use of the value.
+      if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) &&
+          ((Subtarget.hasVSX() && ElementSize == 64) ||
+           (Subtarget.hasP9Vector() && ElementSize == 32))) {
+        SDValue Ops[] = {
+          LD->getChain(),    // Chain
+          LD->getBasePtr(),  // Ptr
+          DAG.getValueType(Op.getValueType()) // VT
+        };
+        return
+          DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl,
+                                  DAG.getVTList(Op.getValueType(), MVT::Other),
+                                  Ops, LD->getMemoryVT(), LD->getMemOperand());
+      }
+    }
+
     // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
     // lowered to VSX instructions under certain conditions.
     // Without VSX, there is no pattern more efficient than expanding the node.
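One subtlety in the hunk above: a load splatted through a BUILD_VECTOR is never a single-use value, because every operand of the BUILD_VECTOR counts as a separate use. A small standalone restatement of the expected use count (illustrative only):

    #include <cassert>

    int main() {
      // A 128-bit vector of 32-bit elements has four BUILD_VECTOR operands,
      // so a load feeding every lane shows up as four uses of its value
      // result; hence hasNUsesOfValue(128 / ElementSize, 0), not hasOneUse().
      unsigned ElementSize = 32; // LD->getMemoryVT().getScalarSizeInBits()
      assert(128 / ElementSize == 4);
      return 0;
    }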
@@ -8792,6 +8834,45 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
 
   unsigned ShiftElts, InsertAtByte;
   bool Swap = false;
+
+  // If this is a load-and-splat, we can do that with a single instruction
+  // in some cases. However if the load has multiple uses, we don't want to
+  // combine it because that will just produce multiple loads.
+  const SDValue *InputLoad = getNormalLoadInput(V1);
+  if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
+      (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
+      InputLoad->hasOneUse()) {
+    bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
+    int SplatIdx =
+      PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
+
+    LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+    // For 4-byte load-and-splat, we need Power9.
+    if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
+      uint64_t Offset = 0;
+      if (IsFourByte)
+        Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
+      else
+        Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
+      SDValue BasePtr = LD->getBasePtr();
+      if (Offset != 0)
+        BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                              BasePtr, DAG.getIntPtrConstant(Offset, dl));
+      SDValue Ops[] = {
+        LD->getChain(),    // Chain
+        BasePtr,           // BasePtr
+        DAG.getValueType(Op.getValueType()) // VT
+      };
+      SDVTList VTL =
+        DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
+      SDValue LdSplt =
+        DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
+                                Ops, LD->getMemoryVT(), LD->getMemOperand());
+      if (LdSplt.getValueType() != SVOp->getValueType(0))
+        LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
+      return LdSplt;
+    }
+  }
   if (Subtarget.hasP9Vector() &&
       PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
                            isLittleEndian)) {
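The offset arithmetic above converts the left-counted (mnemonic-style) splat index into a byte offset that is folded into the load's base pointer; a standalone sketch of the same math, assuming a 16-byte vector (illustrative only, mirroring the Offset lines in the hunk):

    #include <cassert>
    #include <cstdint>

    // Byte offset of the splatted element, given the left-counted splat index.
    // On little-endian targets the index is mirrored before scaling.
    uint64_t splatByteOffset(int SplatIdx, bool IsFourByte,
                             bool IsLittleEndian) {
      if (IsFourByte) // 4 elements of 4 bytes
        return IsLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
      // 2 elements of 8 bytes
      return IsLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
    }

    int main() {
      // Splatting word 3 (left-counted) on LE reads the lowest-addressed word.
      assert(splatByteOffset(3, /*IsFourByte=*/true, /*IsLittleEndian=*/true) == 0);
      assert(splatByteOffset(3, /*IsFourByte=*/true, /*IsLittleEndian=*/false) == 12);
      return 0;
    }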
@@ -8868,7 +8949,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
 
   if (Subtarget.hasVSX()) {
     if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
-      int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+      int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);
 
       SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
       SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
@@ -466,6 +466,10 @@ namespace llvm {
     /// v2f32 value into the lower half of a VSR register.
     LD_VSX_LH,
 
+    /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory
+    /// instruction such as LXVDSX, LXVWSX.
+    LD_SPLAT,
+
     /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
     /// Maps directly to an stxvd2x instruction that will be preceded by
     /// an xxswapd.
@@ -574,9 +578,11 @@ namespace llvm {
     bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                          unsigned &InsertAtByte, bool &Swap, bool IsLE);
 
-    /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
-    /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
-    unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
+    /// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
+    /// appropriate for PPC mnemonics (which have a big endian bias - namely
+    /// elements are counted from the left of the vector register).
+    unsigned getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
+                                        SelectionDAG &DAG);
 
     /// get_VSPLTI_elt - If this is a build_vector of constants which can be
     /// formed by using a vspltis[bhw] instruction of the specified element
@@ -215,21 +215,21 @@ def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
 
 // VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
 def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
-  return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG), SDLoc(N));
+  return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 1, *CurDAG), SDLoc(N));
 }]>;
 def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle node:$lhs, node:$rhs), [{
   return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
 }], VSPLTB_get_imm>;
 def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
-  return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG), SDLoc(N));
+  return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 2, *CurDAG), SDLoc(N));
 }]>;
 def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle node:$lhs, node:$rhs), [{
   return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
 }], VSPLTH_get_imm>;
 def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
-  return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG), SDLoc(N));
+  return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 4, *CurDAG), SDLoc(N));
 }]>;
 def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle node:$lhs, node:$rhs), [{
@@ -62,6 +62,10 @@ def SDT_PPCfpexth : SDTypeProfile<1, 2, [
   SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>, SDTCisPtrTy<2>
 ]>;
 
+def SDT_PPCldsplat : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisPtrTy<1>
+]>;
+
 // Little-endian-specific nodes.
 def SDT_PPClxvd2x : SDTypeProfile<1, 1, [
   SDTCisVT<0, v2f64>, SDTCisPtrTy<1>
@@ -105,6 +109,8 @@ def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
 def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>;
 def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 
 multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
                        string asmstr, InstrItinClass itin, Intrinsic Int,
@@ -3931,6 +3937,10 @@ let AddedComplexity = 400 in {
                        (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
   def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
             (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+  def : Pat<(v2f64 (PPCldsplat xoaddr:$A)),
+            (v2f64 (LXVDSX xoaddr:$A))>;
+  def : Pat<(v2i64 (PPCldsplat xoaddr:$A)),
+            (v2i64 (LXVDSX xoaddr:$A))>;
 
   // Build vectors of floating point converted to i64.
   def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
@@ -4180,6 +4190,10 @@ let AddedComplexity = 400 in {
            (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
                                             (DFLOADf32 iaddrX4:$A),
                                             VSFRC)), 0))>;
+  def : Pat<(v4f32 (PPCldsplat xoaddr:$A)),
+            (v4f32 (LXVWSX xoaddr:$A))>;
+  def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
+            (v4i32 (LXVWSX xoaddr:$A))>;
 }
 
 let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
@@ -27,18 +27,16 @@ define void @testExpandPostRAPseudo(i32* nocapture readonly %ptr) {
 ;
 ; CHECK-P9-LABEL: testExpandPostRAPseudo:
 ; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9: lfiwzx f0, 0, r3
 ; CHECK-P9: addis r4, r2, .LC0@toc@ha
+; CHECK-P9: lxvwsx vs0, 0, r3
 ; CHECK-P9: ld r4, .LC0@toc@l(r4)
-; CHECK-P9: xxpermdi vs0, f0, f0, 2
-; CHECK-P9: xxspltw vs0, vs0, 3
 ; CHECK-P9: stxvx vs0, 0, r4
-; CHECK-P9: lis r4, 1024
 ; CHECK-P9: lfiwax f0, 0, r3
 ; CHECK-P9: addis r3, r2, .LC1@toc@ha
 ; CHECK-P9: ld r3, .LC1@toc@l(r3)
 ; CHECK-P9: xscvsxdsp f0, f0
 ; CHECK-P9: ld r3, 0(r3)
+; CHECK-P9: lis r4, 1024
 ; CHECK-P9: stfsx f0, r3, r4
 ; CHECK-P9: blr
 entry:
@@ -1278,16 +1278,12 @@
 define <4 x i32> @spltMemVali(i32* nocapture readonly %ptr) {
 ; P9BE-LABEL: spltMemVali:
 ; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: lfiwzx f0, 0, r3
-; P9BE-NEXT: xxsldwi vs0, f0, f0, 1
-; P9BE-NEXT: xxspltw v2, vs0, 0
+; P9BE-NEXT: lxvwsx v2, 0, r3
 ; P9BE-NEXT: blr
 ;
 ; P9LE-LABEL: spltMemVali:
 ; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: lfiwzx f0, 0, r3
-; P9LE-NEXT: xxpermdi vs0, f0, f0, 2
-; P9LE-NEXT: xxspltw v2, vs0, 3
+; P9LE-NEXT: lxvwsx v2, 0, r3
 ; P9LE-NEXT: blr
 ;
 ; P8BE-LABEL: spltMemVali:
@@ -2833,16 +2829,12 @@
 define <4 x i32> @spltMemValui(i32* nocapture readonly %ptr) {
 ; P9BE-LABEL: spltMemValui:
 ; P9BE: # %bb.0: # %entry
-; P9BE-NEXT: lfiwzx f0, 0, r3
-; P9BE-NEXT: xxsldwi vs0, f0, f0, 1
-; P9BE-NEXT: xxspltw v2, vs0, 0
+; P9BE-NEXT: lxvwsx v2, 0, r3
 ; P9BE-NEXT: blr
 ;
 ; P9LE-LABEL: spltMemValui:
 ; P9LE: # %bb.0: # %entry
-; P9LE-NEXT: lfiwzx f0, 0, r3
-; P9LE-NEXT: xxpermdi vs0, f0, f0, 2
-; P9LE-NEXT: xxspltw v2, vs0, 3
+; P9LE-NEXT: lxvwsx v2, 0, r3
 ; P9LE-NEXT: blr
 ;
 ; P8BE-LABEL: spltMemValui:
test/CodeGen/PowerPC/load-and-splat.ll (new file, 264 lines)
@@ -0,0 +1,264 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
; RUN:   -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s \
; RUN:   -check-prefix=P9
; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \
; RUN:   -check-prefix=P8
define dso_local void @test(<2 x double>* nocapture %c, double* nocapture readonly %a) local_unnamed_addr {
; P9-LABEL: test:
; P9: # %bb.0: # %entry
; P9-NEXT: addi r4, r4, 24
; P9-NEXT: lxvdsx vs0, 0, r4
; P9-NEXT: stxv vs0, 0(r3)
; P9-NEXT: blr
;
; P8-LABEL: test:
; P8: # %bb.0: # %entry
; P8-NEXT: addi r4, r4, 24
; P8-NEXT: lxvdsx vs0, 0, r4
; P8-NEXT: stxvd2x vs0, 0, r3
; P8-NEXT: blr
entry:
  %arrayidx = getelementptr inbounds double, double* %a, i64 3
  %0 = load double, double* %arrayidx, align 8
  %splat.splatinsert.i = insertelement <2 x double> undef, double %0, i32 0
  %splat.splat.i = shufflevector <2 x double> %splat.splatinsert.i, <2 x double> undef, <2 x i32> zeroinitializer
  store <2 x double> %splat.splat.i, <2 x double>* %c, align 16
  ret void
}

define dso_local void @test2(<4 x float>* nocapture %c, float* nocapture readonly %a) local_unnamed_addr {
; P9-LABEL: test2:
; P9: # %bb.0: # %entry
; P9-NEXT: addi r4, r4, 12
; P9-NEXT: lxvwsx vs0, 0, r4
; P9-NEXT: stxv vs0, 0(r3)
; P9-NEXT: blr
;
; P8-LABEL: test2:
; P8: # %bb.0: # %entry
; P8-NEXT: addi r4, r4, 12
; P8-NEXT: lfiwzx f0, 0, r4
; P8-NEXT: xxpermdi vs0, f0, f0, 2
; P8-NEXT: xxspltw v2, vs0, 3
; P8-NEXT: stvx v2, 0, r3
; P8-NEXT: blr
entry:
  %arrayidx = getelementptr inbounds float, float* %a, i64 3
  %0 = load float, float* %arrayidx, align 4
  %splat.splatinsert.i = insertelement <4 x float> undef, float %0, i32 0
  %splat.splat.i = shufflevector <4 x float> %splat.splatinsert.i, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %splat.splat.i, <4 x float>* %c, align 16
  ret void
}

define dso_local void @test3(<4 x i32>* nocapture %c, i32* nocapture readonly %a) local_unnamed_addr {
; P9-LABEL: test3:
; P9: # %bb.0: # %entry
; P9-NEXT: addi r4, r4, 12
; P9-NEXT: lxvwsx vs0, 0, r4
; P9-NEXT: stxv vs0, 0(r3)
; P9-NEXT: blr
;
; P8-LABEL: test3:
; P8: # %bb.0: # %entry
; P8-NEXT: addi r4, r4, 12
; P8-NEXT: lfiwzx f0, 0, r4
; P8-NEXT: xxpermdi vs0, f0, f0, 2
; P8-NEXT: xxspltw v2, vs0, 3
; P8-NEXT: stvx v2, 0, r3
; P8-NEXT: blr
entry:
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 3
  %0 = load i32, i32* %arrayidx, align 4
  %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %0, i32 0
  %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer
  store <4 x i32> %splat.splat.i, <4 x i32>* %c, align 16
  ret void
}

define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a) local_unnamed_addr {
; P9-LABEL: test4:
; P9: # %bb.0: # %entry
; P9-NEXT: addi r4, r4, 24
; P9-NEXT: lxvdsx vs0, 0, r4
; P9-NEXT: stxv vs0, 0(r3)
; P9-NEXT: blr
;
; P8-LABEL: test4:
; P8: # %bb.0: # %entry
; P8-NEXT: addi r4, r4, 24
; P8-NEXT: lxvdsx vs0, 0, r4
; P8-NEXT: stxvd2x vs0, 0, r3
; P8-NEXT: blr
entry:
  %arrayidx = getelementptr inbounds i64, i64* %a, i64 3
  %0 = load i64, i64* %arrayidx, align 8
  %splat.splatinsert.i = insertelement <2 x i64> undef, i64 %0, i32 0
  %splat.splat.i = shufflevector <2 x i64> %splat.splatinsert.i, <2 x i64> undef, <2 x i32> zeroinitializer
  store <2 x i64> %splat.splat.i, <2 x i64>* %c, align 16
  ret void
}

define <16 x i8> @unadjusted_lxvwsx(i32* %s, i32* %t) {
; P9-LABEL: unadjusted_lxvwsx:
; P9: # %bb.0: # %entry
; P9-NEXT: lxvwsx v2, 0, r3
; P9-NEXT: blr
;
; P8-LABEL: unadjusted_lxvwsx:
; P8: # %bb.0: # %entry
; P8-NEXT: lfiwzx f0, 0, r3
; P8-NEXT: xxpermdi vs0, f0, f0, 2
; P8-NEXT: xxspltw v2, vs0, 3
; P8-NEXT: blr
entry:
  %0 = bitcast i32* %s to <4 x i8>*
  %1 = load <4 x i8>, <4 x i8>* %0, align 4
  %2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i8> %2
}

define <16 x i8> @adjusted_lxvwsx(i64* %s, i64* %t) {
; P9-LABEL: adjusted_lxvwsx:
; P9: # %bb.0: # %entry
; P9-NEXT: addi r3, r3, 4
; P9-NEXT: lxvwsx v2, 0, r3
; P9-NEXT: blr
;
; P8-LABEL: adjusted_lxvwsx:
; P8: # %bb.0: # %entry
; P8-NEXT: ld r3, 0(r3)
; P8-NEXT: mtvsrd f0, r3
; P8-NEXT: xxswapd v2, vs0
; P8-NEXT: xxspltw v2, v2, 2
; P8-NEXT: blr
entry:
  %0 = bitcast i64* %s to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 8
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %2
}

define <16 x i8> @unadjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) {
; P9-LABEL: unadjusted_lxvwsx_v16i8:
; P9: # %bb.0: # %entry
; P9-NEXT: lxvwsx v2, 0, r3
; P9-NEXT: blr
;
; P8-LABEL: unadjusted_lxvwsx_v16i8:
; P8: # %bb.0: # %entry
; P8-NEXT: lvx v2, 0, r3
; P8-NEXT: xxspltw v2, v2, 3
; P8-NEXT: blr
entry:
  %0 = load <16 x i8>, <16 x i8>* %s, align 16
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i8> %1
}

define <16 x i8> @adjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) {
; P9-LABEL: adjusted_lxvwsx_v16i8:
; P9: # %bb.0: # %entry
; P9-NEXT: addi r3, r3, 4
; P9-NEXT: lxvwsx v2, 0, r3
; P9-NEXT: blr
;
; P8-LABEL: adjusted_lxvwsx_v16i8:
; P8: # %bb.0: # %entry
; P8-NEXT: lvx v2, 0, r3
; P8-NEXT: xxspltw v2, v2, 2
; P8-NEXT: blr
entry:
  %0 = load <16 x i8>, <16 x i8>* %s, align 16
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %1
}

define <16 x i8> @adjusted_lxvwsx_v16i8_2(<16 x i8> *%s, <16 x i8> %t) {
; P9-LABEL: adjusted_lxvwsx_v16i8_2:
; P9: # %bb.0: # %entry
; P9-NEXT: addi r3, r3, 8
; P9-NEXT: lxvwsx v2, 0, r3
; P9-NEXT: blr
;
; P8-LABEL: adjusted_lxvwsx_v16i8_2:
; P8: # %bb.0: # %entry
; P8-NEXT: lvx v2, 0, r3
; P8-NEXT: xxspltw v2, v2, 1
; P8-NEXT: blr
entry:
  %0 = load <16 x i8>, <16 x i8>* %s, align 16
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i8> %1
}

define <16 x i8> @adjusted_lxvwsx_v16i8_3(<16 x i8> *%s, <16 x i8> %t) {
; P9-LABEL: adjusted_lxvwsx_v16i8_3:
; P9: # %bb.0: # %entry
; P9-NEXT: addi r3, r3, 12
; P9-NEXT: lxvwsx v2, 0, r3
; P9-NEXT: blr
;
; P8-LABEL: adjusted_lxvwsx_v16i8_3:
; P8: # %bb.0: # %entry
; P8-NEXT: lvx v2, 0, r3
; P8-NEXT: xxspltw v2, v2, 0
; P8-NEXT: blr
entry:
  %0 = load <16 x i8>, <16 x i8>* %s, align 16
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %1
}

define <16 x i8> @unadjusted_lxvdsx(i64* %s, i64* %t) {
; P9-LABEL: unadjusted_lxvdsx:
; P9: # %bb.0: # %entry
; P9-NEXT: lxvdsx v2, 0, r3
; P9-NEXT: blr
;
; P8-LABEL: unadjusted_lxvdsx:
; P8: # %bb.0: # %entry
; P8-NEXT: lxvdsx v2, 0, r3
; P8-NEXT: blr
entry:
  %0 = bitcast i64* %s to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 8
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %2
}

define <16 x i8> @unadjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) {
; P9-LABEL: unadjusted_lxvdsx_v16i8:
; P9: # %bb.0: # %entry
; P9-NEXT: lxvdsx v2, 0, r3
; P9-NEXT: blr
;
; P8-LABEL: unadjusted_lxvdsx_v16i8:
; P8: # %bb.0: # %entry
; P8-NEXT: lxvdsx v2, 0, r3
; P8-NEXT: blr
entry:
  %0 = load <16 x i8>, <16 x i8>* %s, align 16
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %1
}

define <16 x i8> @adjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) {
; P9-LABEL: adjusted_lxvdsx_v16i8:
; P9: # %bb.0: # %entry
; P9-NEXT: addi r3, r3, 8
; P9-NEXT: lxvdsx v2, 0, r3
; P9-NEXT: blr
;
; P8-LABEL: adjusted_lxvdsx_v16i8:
; P8: # %bb.0: # %entry
; P8-NEXT: addi r3, r3, 8
; P8-NEXT: lxvdsx v2, 0, r3
; P8-NEXT: blr
entry:
  %0 = load <16 x i8>, <16 x i8>* %s, align 16
  %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %1
}
@@ -61,16 +61,12 @@ entry:
 define <4 x i32> @test4(i32* nocapture readonly %in) {
 ; CHECK-LABEL: test4:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lfiwzx f0, 0, r3
-; CHECK-NEXT: xxpermdi vs0, f0, f0, 2
-; CHECK-NEXT: xxspltw v2, vs0, 3
+; CHECK-NEXT: lxvwsx v2, 0, r3
 ; CHECK-NEXT: blr
 ;
 ; CHECK-BE-LABEL: test4:
 ; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lfiwzx f0, 0, r3
-; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1
-; CHECK-BE-NEXT: xxspltw v2, vs0, 0
+; CHECK-BE-NEXT: lxvwsx v2, 0, r3
 ; CHECK-BE-NEXT: blr
 
 entry:
@@ -83,16 +79,12 @@ entry:
 define <4 x float> @test5(float* nocapture readonly %in) {
 ; CHECK-LABEL: test5:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lfiwzx f0, 0, r3
-; CHECK-NEXT: xxpermdi vs0, f0, f0, 2
-; CHECK-NEXT: xxspltw v2, vs0, 3
+; CHECK-NEXT: lxvwsx v2, 0, r3
 ; CHECK-NEXT: blr
 ;
 ; CHECK-BE-LABEL: test5:
 ; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lfiwzx f0, 0, r3
-; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1
-; CHECK-BE-NEXT: xxspltw v2, vs0, 0
+; CHECK-BE-NEXT: lxvwsx v2, 0, r3
 ; CHECK-BE-NEXT: blr
 
 entry:
@@ -107,18 +99,14 @@ define <4 x i32> @test6() {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: addis r3, r2, .LC0@toc@ha
 ; CHECK-NEXT: ld r3, .LC0@toc@l(r3)
-; CHECK-NEXT: lfiwzx f0, 0, r3
-; CHECK-NEXT: xxpermdi vs0, f0, f0, 2
-; CHECK-NEXT: xxspltw v2, vs0, 3
+; CHECK-NEXT: lxvwsx v2, 0, r3
 ; CHECK-NEXT: blr
 ;
 ; CHECK-BE-LABEL: test6:
 ; CHECK-BE: # %bb.0: # %entry
 ; CHECK-BE-NEXT: addis r3, r2, .LC0@toc@ha
 ; CHECK-BE-NEXT: ld r3, .LC0@toc@l(r3)
-; CHECK-BE-NEXT: lfiwzx f0, 0, r3
-; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1
-; CHECK-BE-NEXT: xxspltw v2, vs0, 0
+; CHECK-BE-NEXT: lxvwsx v2, 0, r3
 ; CHECK-BE-NEXT: blr
 
 entry:
@@ -133,18 +121,14 @@ define <4 x float> @test7() {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: addis r3, r2, .LC1@toc@ha
 ; CHECK-NEXT: ld r3, .LC1@toc@l(r3)
-; CHECK-NEXT: lfiwzx f0, 0, r3
-; CHECK-NEXT: xxpermdi vs0, f0, f0, 2
-; CHECK-NEXT: xxspltw v2, vs0, 3
+; CHECK-NEXT: lxvwsx v2, 0, r3
 ; CHECK-NEXT: blr
 ;
 ; CHECK-BE-LABEL: test7:
 ; CHECK-BE: # %bb.0: # %entry
 ; CHECK-BE-NEXT: addis r3, r2, .LC1@toc@ha
 ; CHECK-BE-NEXT: ld r3, .LC1@toc@l(r3)
-; CHECK-BE-NEXT: lfiwzx f0, 0, r3
-; CHECK-BE-NEXT: xxsldwi vs0, f0, f0, 1
-; CHECK-BE-NEXT: xxspltw v2, vs0, 0
+; CHECK-BE-NEXT: lxvwsx v2, 0, r3
 ; CHECK-BE-NEXT: blr
 
 entry:
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \
 ; RUN:   -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s
 
@@ -34,9 +35,9 @@ define <4 x double> @fooxu(double* nocapture readonly %a, i64 %idx, double** %pp
 ; CHECK-LABEL: fooxu:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: sldi r4, r4, 3
-; CHECK-NEXT: lfdux f0, r3, r4
-; CHECK-NEXT: xxspltd v2, vs0, 0
-; CHECK-NEXT: std r3, 0(r5)
+; CHECK-NEXT: add r6, r3, r4
+; CHECK-NEXT: std r6, 0(r5)
+; CHECK-NEXT: lxvdsx v2, r3, r4
 ; CHECK-NEXT: vmr v3, v2
 ; CHECK-NEXT: blr
 entry:
@@ -9,8 +9,8 @@
 @G4 = global <2 x double> <double 7.0, double 8.0>
 
 ; CHECK-LABEL: @zg
-; CHECK: xxspltd
-; CHECK-NEXT: xxspltd
-; CHECK-NEXT: xvmuldp
+; CHECK: lxvdsx
+; CHECK-NEXT: lxvdsx
+; CHECK-DAG: xvmuldp
 ; CHECK-DAG: xvsubdp
 