[Power9] Part-word VSX integer scalar loads/stores and sign extend instructions

This patch corresponds to review: https://reviews.llvm.org/D23155. This patch removes the VSHRC register class (based on D20310) and adds exploitation of the Power9 sub-word integer loads into VSX registers as well as vector sign extensions. The new instructions are useful for a few purposes: (1) int-to-FP conversions of 1- or 2-byte values loaded from memory; (2) building vectors of 1- or 2-byte integers with values loaded from memory; (3) storing individual 1- or 2-byte elements from integer vectors. This patch implements all of those uses. llvm-svn: 283190
2025-01-31 20:51:52 +01:00 · 2016-10-04 06:59:23 +00:00 · 2016-10-04 06:59:23 +00:00 · fe9adb9248
commit fe9adb9248
parent fb59389817
33 changed files with 1889 additions and 395 deletions
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@ -83,6 +83,16 @@ static const MCPhysReg FRegs[32] = {
  PPC::F24, PPC::F25, PPC::F26, PPC::F27,
  PPC::F28, PPC::F29, PPC::F30, PPC::F31
 };
+static const MCPhysReg VFRegs[32] = {
+  PPC::VF0,  PPC::VF1,  PPC::VF2,  PPC::VF3,
+  PPC::VF4,  PPC::VF5,  PPC::VF6,  PPC::VF7,
+  PPC::VF8,  PPC::VF9,  PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
 static const MCPhysReg VRegs[32] = {
  PPC::V0,  PPC::V1,  PPC::V2,  PPC::V3,
  PPC::V4,  PPC::V5,  PPC::V6,  PPC::V7,
@ -103,14 +113,14 @@ static const MCPhysReg VSRegs[64] = {
  PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
  PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,

-  PPC::VSH0,  PPC::VSH1,  PPC::VSH2,  PPC::VSH3,
-  PPC::VSH4,  PPC::VSH5,  PPC::VSH6,  PPC::VSH7,
-  PPC::VSH8,  PPC::VSH9,  PPC::VSH10, PPC::VSH11,
-  PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15,
-  PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19,
-  PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23,
-  PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27,
-  PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31
+  PPC::V0,  PPC::V1,  PPC::V2,  PPC::V3,
+  PPC::V4,  PPC::V5,  PPC::V6,  PPC::V7,
+  PPC::V8,  PPC::V9,  PPC::V10, PPC::V11,
+  PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+  PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+  PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+  PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+  PPC::V28, PPC::V29, PPC::V30, PPC::V31
 };
 static const MCPhysReg VSFRegs[64] = {
  PPC::F0,  PPC::F1,  PPC::F2,  PPC::F3,
@ -597,6 +607,11 @@ public:
    Inst.addOperand(MCOperand::createReg(FRegs[getReg()]));
  }

+  void addRegVFRCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(VFRegs[getReg()]));
+  }
+
  void addRegVRRCOperands(MCInst &Inst, unsigned N) const {
    assert(N == 1 && "Invalid number of operands!");
    Inst.addOperand(MCOperand::createReg(VRegs[getReg()]));
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@ -89,6 +89,17 @@ static const unsigned FRegs[] = {
  PPC::F28, PPC::F29, PPC::F30, PPC::F31
 };

+static const unsigned VFRegs[] = {
+  PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3,
+  PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7,
+  PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11,
+  PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15,
+  PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19,
+  PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23,
+  PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27,
+  PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31
+};
+
 static const unsigned VRegs[] = {
  PPC::V0, PPC::V1, PPC::V2, PPC::V3,
  PPC::V4, PPC::V5, PPC::V6, PPC::V7,
@ -110,14 +121,14 @@ static const unsigned VSRegs[] = {
  PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27,
  PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31,

-  PPC::VSH0, PPC::VSH1, PPC::VSH2, PPC::VSH3,
-  PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7,
-  PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11,
-  PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15,
-  PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19,
-  PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23,
-  PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27,
-  PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31
+  PPC::V0, PPC::V1, PPC::V2, PPC::V3,
+  PPC::V4, PPC::V5, PPC::V6, PPC::V7,
+  PPC::V8, PPC::V9, PPC::V10, PPC::V11,
+  PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+  PPC::V16, PPC::V17, PPC::V18, PPC::V19,
+  PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+  PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+  PPC::V28, PPC::V29, PPC::V30, PPC::V31
 };

 static const unsigned VSFRegs[] = {
@ -242,6 +253,12 @@ static DecodeStatus DecodeF8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
  return decodeRegisterClass(Inst, RegNo, FRegs);
 }

+static DecodeStatus DecodeVFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, VFRegs);
+}
+
 static DecodeStatus DecodeVRRCRegisterClass(MCInst &Inst, uint64_t RegNo,
                                            uint64_t Address,
                                            const void *Decoder) {
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//

 #include "PPCInstPrinter.h"
+#include "PPCInstrInfo.h"
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "MCTargetDesc/PPCPredicates.h"
 #include "llvm/MC/MCExpr.h"
@ -447,7 +448,7 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
 /// stripRegisterPrefix - This method strips the character prefix from a
 /// register name so that only the number is left.  Used by for linux asm.
 static const char *stripRegisterPrefix(const char *RegName) {
-  if (FullRegNames)
+  if (FullRegNames || ShowVSRNumsAsVR)
    return RegName;

  switch (RegName[0]) {
@ -468,15 +469,24 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                  raw_ostream &O) {
  const MCOperand &Op = MI->getOperand(OpNo);
  if (Op.isReg()) {
-    const char *RegName = getRegisterName(Op.getReg());
-    if (ShowVSRNumsAsVR) {
-      unsigned RegNum = Op.getReg();
-      if (RegNum >= PPC::VSH0 && RegNum <= PPC::VSH31)
-        O << 'v' << RegNum - PPC::VSH0;
-      else
-        O << RegName;
-      return;
+    unsigned Reg = Op.getReg();
+
+    // There are VSX instructions that use VSX register numbering (vs0 - vs63)
+    // as well as those that use VMX register numbering (v0 - v31 which
+    // correspond to vs32 - vs63). If we have an instruction that uses VSX
+    // numbering, we need to convert the VMX registers to VSX registers.
+    // Namely, we print 32-63 when the instruction operates on one of the
+    // VMX registers.
+    // (Please synchronize with PPCAsmPrinter::printOperand)
+    if ((MII.get(MI->getOpcode()).TSFlags & PPCII::UseVSXReg) &&
+        !ShowVSRNumsAsVR) {
+      if (PPCInstrInfo::isVRRegister(Reg))
+        Reg = PPC::VSX32 + (Reg - PPC::V0);
+      else if (PPCInstrInfo::isVFRegister(Reg))
+        Reg = PPC::VSX32 + (Reg - PPC::VF0);
    }
+
+    const char *RegName = getRegisterName(Reg);
    // The linux and AIX assembler does not take register prefixes.
    if (!isDarwinSyntax())
      RegName = stripRegisterPrefix(RegName);
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include "PPCInstrInfo.h"
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "MCTargetDesc/PPCFixupKinds.h"
 #include "llvm/ADT/Statistic.h"
@ -350,7 +351,6 @@ get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
  return 0x80 >> CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 }

-
 unsigned PPCMCCodeEmitter::
 getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                  SmallVectorImpl<MCFixup> &Fixups,
@ -361,7 +361,14 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
    assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 &&
            MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) ||
           MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
-    return CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+    unsigned Reg = MO.getReg();
+    unsigned Encode = CTX.getRegisterInfo()->getEncodingValue(Reg);
+
+    if ((MCII.get(MI.getOpcode()).TSFlags & PPCII::UseVSXReg))
+      if (PPCInstrInfo::isVRRegister(Reg))
+        Encode += 32;
+
+    return Encode;
  }
  
  assert(MO.isImm() &&
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@ -167,7 +167,23 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,

  switch (MO.getType()) {
  case MachineOperand::MO_Register: {
-    const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg());
+    unsigned Reg = MO.getReg();
+
+    // There are VSX instructions that use VSX register numbering (vs0 - vs63)
+    // as well as those that use VMX register numbering (v0 - v31 which
+    // correspond to vs32 - vs63). If we have an instruction that uses VSX
+    // numbering, we need to convert the VMX registers to VSX registers.
+    // Namely, we print 32-63 when the instruction operates on one of the
+    // VMX registers.
+    // (Please synchronize with PPCInstPrinter::printOperand)
+    if (MI->getDesc().TSFlags & PPCII::UseVSXReg) {
+      if (PPCInstrInfo::isVRRegister(Reg))
+        Reg = PPC::VSX32 + (Reg - PPC::V0);
+      else if (PPCInstrInfo::isVFRegister(Reg))
+        Reg = PPC::VSX32 + (Reg - PPC::VF0);
+    }
+    const char *RegName = PPCInstPrinter::getRegisterName(Reg);
+
    // Linux assembler (Others?) does not take register mnemonics.
    // FIXME - What about special registers used in mfspr/mtspr?
    if (!Subtarget->isDarwin())
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@ -68,11 +68,9 @@ def RetCC_PPC : CallingConv<[
 
  // Vector types returned as "direct" go into V2 .. V9; note that only the
  // ELFv2 ABI fully utilizes all these registers.
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32], 
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
           CCIfSubtarget<"hasAltivec()",
-           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
-  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
-           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>>
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>
 ]>;

 // No explicit register is specified for the AnyReg calling convention. The
@ -121,11 +119,9 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[
  CCIfType<[f64],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
  CCIfType<[v4f64, v4f32, v4i1],
           CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>,
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32], 
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
           CCIfSubtarget<"hasAltivec()",
-           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>,
-  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
-           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>>
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>
 ]>;

 //===----------------------------------------------------------------------===//
@ -193,12 +189,9 @@ def CC_PPC32_SVR4 : CallingConv<[
    CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>,

  // The first 12 Vector arguments are passed in AltiVec registers.
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32], 
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
           CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7,
                          V8, V9, V10, V11, V12, V13]>>>,
-  CCIfType<[v2f64, v2i64], CCIfSubtarget<"hasVSX()",
-           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9,
-                          VSH10, VSH11, VSH12, VSH13]>>>,
           
  CCDelegateTo<CC_PPC32_SVR4_Common>
 ]>;  
@ -287,6 +280,5 @@ def CSR_64_AllRegs_Altivec : CalleeSavedRegs<(add CSR_64_AllRegs,
                                             (sequence "V%u", 0, 31))>;

 def CSR_64_AllRegs_VSX : CalleeSavedRegs<(add CSR_64_AllRegs_Altivec,
-                                         (sequence "VSL%u", 0, 31),
-                                         (sequence "VSH%u", 0, 31))>;
+                                         (sequence "VSL%u", 0, 31))>;

--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@ -685,7 +685,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
    }

    if (Subtarget.isISA3_0() && Subtarget.hasDirectMove())
-      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Legal);
+      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
  }

  if (Subtarget.hasQPX()) {
@ -1075,6 +1075,9 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
  case PPCISD::STBRX:           return "PPCISD::STBRX";
  case PPCISD::LFIWAX:          return "PPCISD::LFIWAX";
  case PPCISD::LFIWZX:          return "PPCISD::LFIWZX";
+  case PPCISD::LXSIZX:          return "PPCISD::LXSIZX";
+  case PPCISD::STXSIX:          return "PPCISD::STXSIX";
+  case PPCISD::VEXTS:           return "PPCISD::VEXTS";
  case PPCISD::LXVD2X:          return "PPCISD::LXVD2X";
  case PPCISD::STXVD2X:         return "PPCISD::STXVD2X";
  case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
@ -2986,7 +2989,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
          break;
        case MVT::v2f64:
        case MVT::v2i64:
-          RC = &PPC::VSHRCRegClass;
+          RC = &PPC::VRRCRegClass;
          break;
        case MVT::v4f64:
          RC = &PPC::QFRCRegClass;
@ -3169,10 +3172,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };
-  static const MCPhysReg VSRH[] = {
-    PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
-    PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
-  };

  const unsigned Num_GPR_Regs = array_lengthof(GPR);
  const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
@ -3448,9 +3447,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
      // passed directly.  The latter are used to implement ELFv2 homogenous
      // vector aggregates.
      if (VR_idx != Num_VR_Regs) {
-        unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
-                        MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
-                        MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++VR_idx;
      } else {
@ -5056,10 +5053,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };
-  static const MCPhysReg VSRH[] = {
-    PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
-    PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
-  };

  const unsigned NumGPRs = array_lengthof(GPR);
  const unsigned NumFPRs = 13;
@ -5486,13 +5479,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
          SDValue Load =
              DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
          MemOpChains.push_back(Load.getValue(1));
-
-          unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
-                           Arg.getSimpleValueType() == MVT::v2i64) ?
-                          VSRH[VR_idx] : VR[VR_idx];
-          ++VR_idx;
-
-          RegsToPass.push_back(std::make_pair(VReg, Load));
+          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
        }
        ArgOffset += 16;
        for (unsigned i=0; i<16; i+=PtrByteSize) {
@ -5510,12 +5497,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(

      // Non-varargs Altivec params go into VRs or on the stack.
      if (VR_idx != NumVRs) {
-        unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
-                         Arg.getSimpleValueType() == MVT::v2i64) ?
-                        VSRH[VR_idx] : VR[VR_idx];
-        ++VR_idx;
-
-        RegsToPass.push_back(std::make_pair(VReg, Arg));
+        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
      } else {
        if (CallConv == CallingConv::Fast)
          ComputePtrOff();
@ -7094,7 +7076,7 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
 }

 static bool isNonConstSplatBV(BuildVectorSDNode *BVN, EVT Type) {
-  if (BVN->getValueType(0) != Type)
+  if (BVN->isConstant() || BVN->getValueType(0) != Type)
    return false;
  auto OpZero = BVN->getOperand(0);
  for (int i = 1, e = BVN->getNumOperands(); i < e; i++)
@ -7230,8 +7212,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
    auto OpZero = BVN->getOperand(0);
    bool CanLoadAndSplat = OpZero.getOpcode() == ISD::LOAD &&
      BVN->isOnlyUserOf(OpZero.getNode());
-    if (Subtarget.isISA3_0() &&
-        isNonConstSplatBV(BVN, MVT::v4i32) && !CanLoadAndSplat)
+    if (Subtarget.isISA3_0() && !CanLoadAndSplat &&
+        (isNonConstSplatBV(BVN, MVT::v4i32) ||
+         isNonConstSplatBV(BVN, MVT::v2i64)))
      return Op;
    return SDValue();
  }
@ -10571,6 +10554,34 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
  SDLoc dl(N);
  SDValue Op(N, 0);

+  SDValue FirstOperand(Op.getOperand(0));
+  bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
+    (FirstOperand.getValueType() == MVT::i8 ||
+     FirstOperand.getValueType() == MVT::i16);
+  if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
+    bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
+    bool DstDouble = Op.getValueType() == MVT::f64;
+    unsigned ConvOp = Signed ?
+      (DstDouble ? PPCISD::FCFID  : PPCISD::FCFIDS) :
+      (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
+    SDValue WidthConst =
+      DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
+                            dl, false);
+    LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
+    SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
+    SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
+                                         DAG.getVTList(MVT::f64, MVT::Other),
+                                         Ops, MVT::i8, LDN->getMemOperand());
+
+    // For signed conversion, we need to sign-extend the value in the VSR
+    if (Signed) {
+      SDValue ExtOps[] = { Ld, WidthConst };
+      SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
+      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext);
+    } else
+      return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
+  }
+
  // Don't handle ppc_fp128 here or i1 conversions.
  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
    return SDValue();
@ -10783,10 +10794,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
  case ISD::UINT_TO_FP:
    return combineFPToIntToFP(N, DCI);
  case ISD::STORE: {
+    EVT Op1VT = N->getOperand(1).getValueType();
+    bool ValidTypeForStoreFltAsInt = (Op1VT == MVT::i32) ||
+      (Subtarget.hasP9Vector() && (Op1VT == MVT::i8 || Op1VT == MVT::i16));
+
    // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
    if (Subtarget.hasSTFIWX() && !cast<StoreSDNode>(N)->isTruncatingStore() &&
        N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
-        N->getOperand(1).getValueType() == MVT::i32 &&
+        ValidTypeForStoreFltAsInt &&
        N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
      SDValue Val = N->getOperand(1).getOperand(0);
      if (Val.getValueType() == MVT::f32) {
@ -10796,15 +10811,31 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
      Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
      DCI.AddToWorklist(Val.getNode());

-      SDValue Ops[] = {
-        N->getOperand(0), Val, N->getOperand(2),
-        DAG.getValueType(N->getOperand(1).getValueType())
-      };
+      if (Op1VT == MVT::i32) {
+        SDValue Ops[] = {
+          N->getOperand(0), Val, N->getOperand(2),
+          DAG.getValueType(N->getOperand(1).getValueType())
+        };
+
+        Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+                DAG.getVTList(MVT::Other), Ops,
+                cast<StoreSDNode>(N)->getMemoryVT(),
+                cast<StoreSDNode>(N)->getMemOperand());
+      } else {
+        unsigned WidthInBytes =
+          N->getOperand(1).getValueType() == MVT::i8 ? 1 : 2;
+        SDValue WidthConst = DAG.getIntPtrConstant(WidthInBytes, dl, false);
+
+        SDValue Ops[] = {
+          N->getOperand(0), Val, N->getOperand(2), WidthConst,
+          DAG.getValueType(N->getOperand(1).getValueType())
+        };
+        Val = DAG.getMemIntrinsicNode(PPCISD::STXSIX, dl,
+                                      DAG.getVTList(MVT::Other), Ops,
+                                      cast<StoreSDNode>(N)->getMemoryVT(),
+                                      cast<StoreSDNode>(N)->getMemOperand());
+      }

-      Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
-              DAG.getVTList(MVT::Other), Ops,
-              cast<StoreSDNode>(N)->getMemoryVT(),
-              cast<StoreSDNode>(N)->getMemOperand());
      DCI.AddToWorklist(Val.getNode());
      return Val;
    }
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@ -50,6 +50,10 @@ namespace llvm {
      /// unsigned integers.
      FCTIDUZ, FCTIWUZ,

+      /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in
+      /// VSFRC that is sign-extended from ByteWidth to a 64-bit integer.
+      VEXTS,
+
      /// Reciprocal estimate instructions (unary FP ops).
      FRE, FRSQRTE,

@ -365,6 +369,16 @@ namespace llvm {
      /// destination 64-bit register.
      LFIWZX,

+      /// VSFRC, CHAIN = LXSIZX CHAIN, Ptr, ByteWidth - This is a load of an
+      /// integer smaller than 64 bits into a VSR. The integer is zero-extended.
+      /// This can be used for converting loaded integers to floating point.
+      LXSIZX,
+
+      /// STXSIX - The STXSI[bh]X instruction. The first operand is an input
+      /// chain, then an f64 value to store, then an address to store it to,
+      /// followed by a byte-width for the store.
+      STXSIX,
+
      /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
      /// Maps directly to an lxvd2x instruction that will be followed by
      /// an xxswapd.
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@ -706,6 +706,12 @@ def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
                      "vspltw $vD, $vB, $UIMM", IIC_VecPerm,
                      [(set v16i8:$vD, 
                        (vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+let isCodeGenOnly = 1 in {
+  def VSPLTBs : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
+                         "vspltb $vD, $vB, $UIMM", IIC_VecPerm, []>;
+  def VSPLTHs : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
+                         "vsplth $vD, $vB, $UIMM", IIC_VecPerm, []>;
+}

 def VSR    : VX1_Int_Ty< 708, "vsr"  , int_ppc_altivec_vsr,  v4i32>;
 def VSRO   : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>;
@ -1270,6 +1276,9 @@ def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
 class VX_VT5_EO5_VB5<bits<11> xo, bits<5> eo, string opc, list<dag> pattern>
  : VXForm_RD5_XO5_RS5<xo, eo, (outs vrrc:$vD), (ins vrrc:$vB),
                       !strconcat(opc, " $vD, $vB"), IIC_VecGeneral, pattern>;
+class VX_VT5_EO5_VB5s<bits<11> xo, bits<5> eo, string opc, list<dag> pattern>
+  : VXForm_RD5_XO5_RS5<xo, eo, (outs vfrc:$vD), (ins vfrc:$vB),
+                       !strconcat(opc, " $vD, $vB"), IIC_VecGeneral, pattern>;

 // Vector Count Leading/Trailing Zero LSB. Result is placed into GPR[rD]
 def VCLZLSBB : VXForm_RD5_XO5_RS5<1538, 0, (outs g8rc:$rD), (ins vrrc:$vB),
@ -1292,6 +1301,13 @@ def VEXTSH2W : VX_VT5_EO5_VB5<1538, 17, "vextsh2w", []>;
 def VEXTSB2D : VX_VT5_EO5_VB5<1538, 24, "vextsb2d", []>;
 def VEXTSH2D : VX_VT5_EO5_VB5<1538, 25, "vextsh2d", []>;
 def VEXTSW2D : VX_VT5_EO5_VB5<1538, 26, "vextsw2d", []>;
+let isCodeGenOnly = 1 in {
+  def VEXTSB2Ws : VX_VT5_EO5_VB5s<1538, 16, "vextsb2w", []>;
+  def VEXTSH2Ws : VX_VT5_EO5_VB5s<1538, 17, "vextsh2w", []>;
+  def VEXTSB2Ds : VX_VT5_EO5_VB5s<1538, 24, "vextsb2d", []>;
+  def VEXTSH2Ds : VX_VT5_EO5_VB5s<1538, 25, "vextsh2d", []>;
+  def VEXTSW2Ds : VX_VT5_EO5_VB5s<1538, 26, "vextsw2d", []>;
+}

 // Vector Integer Negate
 def VNEGW : VX_VT5_EO5_VB5<1538, 6, "vnegw", []>;
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@ -38,6 +38,14 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
  let TSFlags{2}   = PPC970_Cracked;
  let TSFlags{5-3} = PPC970_Unit;

+  /// Indicate that the VSX instruction is to use VSX numbering/encoding.
+  /// Since ISA 3.0, there are scalar instructions that use the upper
+  /// half of the VSX register set only. Rather than adding further complexity
+  /// to the register class set, the VSX registers just include the Altivec
+  /// registers and this flag decides the numbering to be used for them.
+  bits<1> UseVSXReg = 0;
+  let TSFlags{6}   = UseVSXReg;
+
  // Fields used for relation models.
  string BaseName = "";

@ -62,6 +70,8 @@ class PPC970_Unit_VALU     { bits<3> PPC970_Unit = 5;   }
 class PPC970_Unit_VPERM    { bits<3> PPC970_Unit = 6;   }
 class PPC970_Unit_BRU      { bits<3> PPC970_Unit = 7;   }

+class UseVSXReg { bits<1> UseVSXReg = 1; }
+
 // Two joined instructions; used to emit two adjacent instructions as one.
 // The itinerary from the first instruction is used for scheduling and
 // classification.
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@ -858,15 +858,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
    if (VSXSelfCopyCrash && SrcReg == SuperReg)
      llvm_unreachable("nop VSX copy");

-    DestReg = SuperReg;
-  } else if (PPC::VRRCRegClass.contains(DestReg) &&
-             PPC::VSRCRegClass.contains(SrcReg)) {
-    unsigned SuperReg =
-      TRI->getMatchingSuperReg(DestReg, PPC::sub_128, &PPC::VSRCRegClass);
-
-    if (VSXSelfCopyCrash && SrcReg == SuperReg)
-      llvm_unreachable("nop VSX copy");
-
    DestReg = SuperReg;
  } else if (PPC::F8RCRegClass.contains(SrcReg) &&
             PPC::VSRCRegClass.contains(DestReg)) {
@ -876,15 +867,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
    if (VSXSelfCopyCrash && DestReg == SuperReg)
      llvm_unreachable("nop VSX copy");

-    SrcReg = SuperReg;
-  } else if (PPC::VRRCRegClass.contains(SrcReg) &&
-             PPC::VSRCRegClass.contains(DestReg)) {
-    unsigned SuperReg =
-      TRI->getMatchingSuperReg(SrcReg, PPC::sub_128, &PPC::VSRCRegClass);
-
-    if (VSXSelfCopyCrash && DestReg == SuperReg)
-      llvm_unreachable("nop VSX copy");
-
    SrcReg = SuperReg;
  }

@ -1073,6 +1055,15 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setHasSpills();

+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  RC = updatedRC(RC);
+
  bool NonRI = false, SpillsVRS = false;
  if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs,
                          NonRI, SpillsVRS))
@ -1185,6 +1176,16 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setHasSpills();

+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
+    RC = &PPC::VSRCRegClass;
+
  bool NonRI = false, SpillsVRS = false;
  if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs,
                           NonRI, SpillsVRS))
@ -1884,3 +1885,10 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  }
  return false;
 }
+
+const TargetRegisterClass *
+PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const {
+  if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
+    return &PPC::VSRCRegClass;
+  return RC;
+}
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@ -61,6 +61,15 @@ enum PPC970_Unit {
  PPC970_VPERM  = 6 << PPC970_Shift,   // Vector Permute Unit
  PPC970_BRU    = 7 << PPC970_Shift    // Branch Unit
 };
+
+enum {
+  /// Shift count to bypass PPC970 flags
+  NewDef_Shift = 6,
+
+  /// Set on VSX instructions that use VSX register numbering (vs0-vs63)
+  /// instead of VMX register numbering (v0-v31).
+  UseVSXReg = 0x1 << NewDef_Shift
+};
 } // end namespace PPCII

 class PPCSubtarget;
@ -273,6 +282,14 @@ public:

  // Lower pseudo instructions after register allocation.
  bool expandPostRAPseudo(MachineInstr &MI) const override;
+
+  static bool isVFRegister(unsigned Reg) {
+    return Reg >= PPC::VF0 && Reg <= PPC::VF31;
+  }
+  static bool isVRRegister(unsigned Reg) {
+    return Reg >= PPC::V0 && Reg <= PPC::V31;
+  }
+  const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const;
 };

 }
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@ -23,6 +23,15 @@ def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
 def SDT_PPClfiwx : SDTypeProfile<1, 1, [ // lfiw[az]x
  SDTCisVT<0, f64>, SDTCisPtrTy<1>
 ]>;
+def SDT_PPCLxsizx : SDTypeProfile<1, 2, [
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
+def SDT_PPCstxsix : SDTypeProfile<0, 3, [
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>, SDTCisPtrTy<2>
+]>;
+def SDT_PPCVexts  : SDTypeProfile<1, 2, [
+  SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
+]>;

 def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
 def SDT_PPCCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
@ -108,6 +117,11 @@ def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
                       [SDNPHasChain, SDNPMayLoad]>;
 def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx,
                       [SDNPHasChain, SDNPMayLoad]>;
+def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx,
+                       [SDNPHasChain, SDNPMayLoad]>;
+def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix,
+                       [SDNPHasChain, SDNPMayStore]>;
+def PPCVexts  : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>;

 // Extract FPSCR (not modeled at the DAG level).
 def PPCmffs   : SDNode<"PPCISD::MFFS",
@ -445,6 +459,12 @@ def PPCRegVRRCAsmOperand : AsmOperandClass {
 def vrrc : RegisterOperand<VRRC> {
  let ParserMatchClass = PPCRegVRRCAsmOperand;
 }
+// Asm-parser operand class and register operand for the VFRC register class.
+def PPCRegVFRCAsmOperand : AsmOperandClass {
+  let Name = "RegVFRC";
+  let PredicateMethod = "isRegNumber";
+}
+def vfrc : RegisterOperand<VFRC> {
+  let ParserMatchClass = PPCRegVFRCAsmOperand;
+}
 def PPCRegCRBITRCAsmOperand : AsmOperandClass {
  let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber";
 }
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@ -89,6 +89,22 @@ multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
  }
 }

+// Instruction form with a single input register for instructions such as
+// XXPERMDI. It exists because an output pattern that names the same chained
+// operation (such as a load) for both inputs emits that operation twice
+// instead of sharing one result register - even though the source memory
+// location is the same. Tying XB to XA forces the instruction to use the
+// same register for both inputs.
+// For example, an output DAG such as this:
+//   (XXPERMDI (LXSIBZX xoaddr:$src), (LXSIBZX xoaddr:$src), 0)
+// would result in two load instructions emitted and used as separate inputs
+// to the XXPERMDI instruction.
+class XX3Form_2s<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern>
+  : XX3Form_2<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+    let XB = XA;
+}
+
 def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
 def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">;
 def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">;
@ -96,6 +112,7 @@ def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">;

 let Predicates = [HasVSX] in {
 let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+let UseVSXReg = 1 in {
 let hasSideEffects = 0 in { // VSX instructions don't have side effects.
 let Uses = [RM] in {

@ -783,6 +800,9 @@ let Uses = [RM] in {
  def XXPERMDI : XX3Form_2<60, 10,
                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
                       "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>;
+  let isCodeGenOnly = 1 in
+  def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vfrc:$XA, u2imm:$DM),
+                             "xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>;
  def XXSEL : XX4Form<60, 3,
                      (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC),
                      "xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>;
@ -797,7 +817,12 @@ let Uses = [RM] in {
                       "xxspltw $XT, $XB, $UIM", IIC_VecPerm,
                       [(set v4i32:$XT,
                             (PPCxxsplt v4i32:$XB, imm32SExt16:$UIM))]>;
+  let isCodeGenOnly = 1 in
+  def XXSPLTWs : XX2Form_2<60, 164,
+                       (outs vsrc:$XT), (ins vfrc:$XB, u2imm:$UIM),
+                       "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
 } // hasSideEffects
+} // UseVSXReg = 1

 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
 // instruction selection into a branch sequence.
@ -849,6 +874,12 @@ def : InstAlias<"xxmrgld $XT, $XA, $XB",
                (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
 def : InstAlias<"xxswapd $XT, $XB",
                (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
+def : InstAlias<"xxspltd $XT, $XB, 0",
+                (XXPERMDIs vsrc:$XT, vfrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+                (XXPERMDIs vsrc:$XT, vfrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+                (XXPERMDIs vsrc:$XT, vfrc:$XB, 2)>;

 let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.

@ -1071,6 +1102,22 @@ def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
 } // AddedComplexity
 } // HasVSX

+// Input DAG fragments for scalar integer loads from xoaddr:$src, shared by the
+// scalar_to_vector patterns. Li<N> is an extload, ZELi<N> a zextload and
+// SELi<N> a sign-extending load of an N-bit value; the i64 suffix marks an
+// i64 (rather than i32) result.
+def ScalarLoads {
+  // Byte loads. The sign-extending forms are sext_inreg of an extloadi8.
+  dag Li8       = (i32 (extloadi8 xoaddr:$src));
+  dag ZELi8     = (i32 (zextloadi8 xoaddr:$src));
+  dag ZELi8i64  = (i64 (zextloadi8 xoaddr:$src));
+  dag SELi8     = (i32 (sext_inreg (extloadi8 xoaddr:$src), i8));
+  dag SELi8i64  = (i64 (sext_inreg (extloadi8 xoaddr:$src), i8));
+
+  // Halfword loads.
+  dag Li16      = (i32 (extloadi16 xoaddr:$src));
+  dag ZELi16    = (i32 (zextloadi16 xoaddr:$src));
+  dag ZELi16i64 = (i64 (zextloadi16 xoaddr:$src));
+  dag SELi16    = (i32 (sextloadi16 xoaddr:$src));
+  dag SELi16i64 = (i64 (sextloadi16 xoaddr:$src));
+
+  // Word load.
+  dag Li32      = (i32 (load xoaddr:$src));
+}
+
 // The following VSX instructions were introduced in Power ISA 2.07
 /* FIXME: if the operands are v2i64, these patterns will not match.
   we should define new patterns or otherwise match the same patterns
@ -1080,7 +1127,7 @@ def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">;
 def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">;
 let Predicates = [HasP8Vector] in {
 let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
-  let isCommutable = 1 in {
+  let isCommutable = 1, UseVSXReg = 1 in {
    def XXLEQV : XX3Form<60, 186,
                         (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
                         "xxleqv $XT, $XA, $XB", IIC_VecGeneral,
@ -1090,11 +1137,12 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
                          "xxlnand $XT, $XA, $XB", IIC_VecGeneral,
                          [(set v4i32:$XT, (vnot_ppc (and v4i32:$XA,
                                                    v4i32:$XB)))]>;
-  } // isCommutable
+  } // isCommutable, UseVSXReg

  def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
            (XXLEQV $A, $B)>;

+  let UseVSXReg = 1 in {
  def XXLORC : XX3Form<60, 170,
                       (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
                       "xxlorc $XT, $XA, $XB", IIC_VecGeneral,
@ -1122,6 +1170,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
                          "stxsiwx $XT, $dst", IIC_LdStSTFD,
                          [(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
  } // mayStore
+  } // UseVSXReg = 1

  def : Pat<(f64 (extloadf32 xoaddr:$src)),
            (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>;
@ -1149,6 +1198,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
  def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
            (SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>;

+  let UseVSXReg = 1 in {
  // VSX Elementary Scalar FP arithmetic (SP)
  let isCommutable = 1 in {
    def XSADDSP : XX3Form<60, 0,
@ -1273,6 +1323,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
                          "xscvdpspn $XT, $XB", IIC_VecFP, []>;
  def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
                          "xscvspdpn $XT, $XB", IIC_VecFP, []>;
+  } // UseVSXReg = 1

  let Predicates = [IsLittleEndian] in {
  def : Pat<(f32 (PPCfcfids (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
@ -1295,9 +1346,12 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
  def : Pat<(f32 (PPCfcfidus (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
            (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
  }
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.Li32)),
+            (v4i32 (XXSPLTWs (LXSIWAX xoaddr:$src), 1))>;
 } // AddedComplexity = 400
 } // HasP8Vector

+let UseVSXReg = 1 in {
 let Predicates = [HasDirectMove] in {
  // VSX direct move instructions
  def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT),
@ -1332,6 +1386,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in {
                              []>, Requires<[In64BitMode]>;

 } // IsISA3_0, HasDirectMove
+} // UseVSXReg = 1

 /*  Direct moves of various widths from GPR's into VSR's. Each move lines
    the value up into element 0 (both BE and LE). Namely, entities smaller than
@ -1911,6 +1966,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
    : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vbtype:$vB),
                    !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;

+  let UseVSXReg = 1 in {
  // [PO T XO B XO BX /]
  class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
                        list<dag> pattern>
@ -1929,6 +1985,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
                  InstrItinClass itin, list<dag> pattern>
    : XX3Form<opcode, xo, (outs xty:$XT), (ins aty:$XA, bty:$XB),
              !strconcat(opc, " $XT, $XA, $XB"), itin, pattern>;
+  } // UseVSXReg = 1

  // [PO VRT VRA VRB XO /]
  class X_VT5_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
@ -1997,7 +2054,8 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
  // DP/QP Compare Exponents
  def XSCMPEXPDP : XX3Form_1<60, 59,
                             (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
-                             "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>;
+                             "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>,
+                   UseVSXReg;
  def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>;

  // DP Compare ==, >=, >, !=
@ -2011,6 +2069,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
                                  IIC_FPCompare, []>;
  def XSCMPNEDP : XX3_XT5_XA5_XB5<60, 27, "xscmpnedp", vsrc, vsfrc, vsfrc,
                                  IIC_FPCompare, []>;
+  let UseVSXReg = 1 in {
  // Vector Compare Not Equal
  def XVCMPNEDP  : XX3Form_Rc<60, 123,
                              (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
@ -2028,12 +2087,13 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
                              (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
                              "xvcmpnesp. $XT, $XA, $XB", IIC_VecFPCompare, []>,
                              isDOT;
+  } // UseVSXReg = 1

  //===--------------------------------------------------------------------===//
  // Quad-Precision Floating-Point Conversion Instructions:

  // Convert DP -> QP
-  def XSCVDPQP  : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vsfrc, []>;
+  def XSCVDPQP  : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc, []>;

  // Round & Convert QP -> DP (dword[1] is set to zero)
  def XSCVQPDP  : X_VT5_XO5_VB5   <63, 20, 836, "xscvqpdp" , []>;
@ -2046,8 +2106,8 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
  def XSCVQPUWZ : X_VT5_XO5_VB5<63,  1, 836, "xscvqpuwz", []>;

  // Convert (Un)Signed DWord -> QP
-  def XSCVSDQP  : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vsfrc, []>;
-  def XSCVUDQP  : X_VT5_XO5_VB5_TyVB<63,  2, 836, "xscvudqp", vsfrc, []>;
+  def XSCVSDQP  : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vfrc, []>;
+  def XSCVUDQP  : X_VT5_XO5_VB5_TyVB<63,  2, 836, "xscvudqp", vfrc, []>;

  //===--------------------------------------------------------------------===//
  // Round to Floating-Point Integer Instructions
@ -2084,7 +2144,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
  // Insert Exponent DP/QP
  // XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU
  def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
-                          "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>;
+                          "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>, UseVSXReg;
  // vB NOTE: only vB.dword[0] is used, that's why we don't use
  //          X_VT5_VA5_VB5 form
  def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB),
@ -2093,10 +2153,12 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
  // Extract Exponent/Significand DP/QP
  def XSXEXPDP : XX2_RT5_XO5_XB6<60,  0, 347, "xsxexpdp", []>;
  def XSXSIGDP : XX2_RT5_XO5_XB6<60,  1, 347, "xsxsigdp", []>;
+
  def XSXEXPQP : X_VT5_XO5_VB5  <63,  2, 804, "xsxexpqp", []>;
  def XSXSIGQP : X_VT5_XO5_VB5  <63, 18, 804, "xsxsigqp", []>;

  // Vector Insert Word
+  let UseVSXReg = 1 in {
  // XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB.
  def XXINSERTW   :
    XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
@ -2110,6 +2172,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
  def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165,
                                  (outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM),
                                  "xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>;
+  } // UseVSXReg = 1

  // Vector Insert Exponent DP/SP
  def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc,
@ -2126,23 +2189,27 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
  //===--------------------------------------------------------------------===//

  // Test Data Class SP/DP/QP
+  let UseVSXReg = 1 in {
  def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298,
                              (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
                              "xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>;
  def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362,
                              (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
                              "xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>;
+  } // UseVSXReg = 1
  def XSTSTDCQP : X_BF3_DCMX7_RS5  <63, 708,
                              (outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB),
                              "xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>;

  // Vector Test Data Class SP/DP
+  let UseVSXReg = 1 in {
  def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5,
                              (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
                              "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP, []>;
  def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5,
                              (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
                              "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP, []>;
+  } // UseVSXReg = 1

  //===--------------------------------------------------------------------===//

@ -2173,7 +2240,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {

  // Vector Splat Immediate Byte
  def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
-                            "xxspltib $XT, $IMM8", IIC_VecPerm, []>;
+                            "xxspltib $XT, $IMM8", IIC_VecPerm, []>, UseVSXReg;

  //===--------------------------------------------------------------------===//
  // Vector/Scalar Load/Store Instructions
@ -2181,12 +2248,12 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
  let mayLoad = 1 in {
  // Load Vector
  def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
-                            "lxv $XT, $src", IIC_LdStLFD, []>;
+                            "lxv $XT, $src", IIC_LdStLFD, []>, UseVSXReg;
  // Load DWord
-  def LXSD  : DSForm_1<57, 2, (outs vrrc:$vD), (ins memrix:$src),
+  def LXSD  : DSForm_1<57, 2, (outs vfrc:$vD), (ins memrix:$src),
                       "lxsd $vD, $src", IIC_LdStLFD, []>;
  // Load SP from src, convert it to DP, and place in dword[0]
-  def LXSSP : DSForm_1<57, 3, (outs vrrc:$vD), (ins memrix:$src),
+  def LXSSP : DSForm_1<57, 3, (outs vfrc:$vD), (ins memrix:$src),
                       "lxssp $vD, $src", IIC_LdStLFD, []>;

  // [PO T RA RB XO TX] almost equal to [PO S RA RB XO SX], but has different
@ -2194,11 +2261,13 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
  class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
                      RegisterOperand vtype, list<dag> pattern>
    : XX1Form<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
-              !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>;
+              !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>, UseVSXReg;

  // Load as Integer Byte/Halfword & Zero Indexed
-  def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc, []>;
-  def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc, []>;
+  def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc,
+                              [(set f64:$XT, (PPClxsizx xoaddr:$src, 1))]>;
+  def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc,
+                              [(set f64:$XT, (PPClxsizx xoaddr:$src, 2))]>;

  // Load Vector Halfword*8/Byte*16 Indexed
  def LXVH8X  : X_XT6_RA5_RB5<31, 812, "lxvh8x" , vsrc, []>;
@ -2214,28 +2283,34 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {

  // Load Vector Word & Splat Indexed
  def LXVWSX  : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>;
-  } // end mayLoad
+  } // mayLoad

  let mayStore = 1 in {
  // Store Vector
  def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
-                             "stxv $XT, $dst", IIC_LdStSTFD, []>;
+                             "stxv $XT, $dst", IIC_LdStSTFD, []>, UseVSXReg;
  // Store DWord
-  def STXSD  : DSForm_1<61, 2, (outs), (ins vrrc:$vS, memrix:$dst),
+  def STXSD  : DSForm_1<61, 2, (outs), (ins vfrc:$vS, memrix:$dst),
                        "stxsd $vS, $dst", IIC_LdStSTFD, []>;
  // Convert DP of dword[0] to SP, and Store to dst
-  def STXSSP : DSForm_1<61, 3, (outs), (ins vrrc:$vS, memrix:$dst),
+  def STXSSP : DSForm_1<61, 3, (outs), (ins vfrc:$vS, memrix:$dst),
                        "stxssp $vS, $dst", IIC_LdStSTFD, []>;

  // [PO S RA RB XO SX]
  class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
                      RegisterOperand vtype, list<dag> pattern>
    : XX1Form<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
-              !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>;
+              !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>, UseVSXReg;

  // Store as Integer Byte/Halfword Indexed
-  def STXSIBX  : X_XS6_RA5_RB5<31,  909, "stxsibx" , vsfrc, []>;
-  def STXSIHX  : X_XS6_RA5_RB5<31,  941, "stxsihx" , vsfrc, []>;
+  def STXSIBX  : X_XS6_RA5_RB5<31,  909, "stxsibx" , vsfrc,
+                               [(PPCstxsix f64:$XT, xoaddr:$dst, 1)]>;
+  def STXSIHX  : X_XS6_RA5_RB5<31,  941, "stxsihx" , vsfrc,
+                               [(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>;
+  let isCodeGenOnly = 1 in {
+    def STXSIBXv  : X_XS6_RA5_RB5<31,  909, "stxsibx" , vrrc, []>;
+    def STXSIHXv  : X_XS6_RA5_RB5<31,  941, "stxsihx" , vrrc, []>;
+  }

  // Store Vector Halfword*8/Byte*16 Indexed
  def STXVH8X  : X_XS6_RA5_RB5<31,  940, "stxvh8x" , vsrc, []>;
@ -2248,7 +2323,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
  // Store Vector (Left-justified) with Length
  def STXVL    : X_XS6_RA5_RB5<31,  397, "stxvl"   , vsrc, []>;
  def STXVLL   : X_XS6_RA5_RB5<31,  429, "stxvll"  , vsrc, []>;
-  } // end mayStore
+  } // mayStore

  // Patterns for which instructions from ISA 3.0 are a better match
  let Predicates = [IsLittleEndian, HasP9Vector] in {
@ -2341,6 +2416,146 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
            (v4i32 (XXSPLTIB 255))>;
  def : Pat<(v2i64 immAllOnesV),
            (v2i64 (XXSPLTIB 255))>;
+
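+  // Build vectors from i8 loads: load the byte with LXSIBZX, sign-extend it
+  // in the register where the pattern requires it, then splat the result.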
+  def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)),
+            (v16i8 (VSPLTBs 7, (LXSIBZX xoaddr:$src)))>;
+  def : Pat<(v8i16 (scalar_to_vector ScalarLoads.ZELi8)),
+            (v8i16 (VSPLTHs 3, (LXSIBZX xoaddr:$src)))>;
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi8)),
+           (v4i32 (XXSPLTWs (LXSIBZX xoaddr:$src), 1))>;
+  def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi8i64)),
+            (v2i64 (XXPERMDIs (LXSIBZX xoaddr:$src), 0))>;
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi8)),
+            (v4i32 (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1))>;
+  def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi8i64)),
+            (v2i64 (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0))>;
+
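+  // Build vectors from i16 loads: load the halfword with LXSIHZX, sign-extend
+  // it in the register where the pattern requires it, then splat the result.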
+  def : Pat<(v8i16 (scalar_to_vector ScalarLoads.Li16)),
+            (v8i16 (VSPLTHs 3, (LXSIHZX xoaddr:$src)))>;
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi16)),
+            (v4i32 (XXSPLTWs (LXSIHZX xoaddr:$src), 1))>;
+  def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi16i64)),
+           (v2i64 (XXPERMDIs (LXSIHZX xoaddr:$src), 0))>;
+  def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi16)),
+            (v4i32 (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1))>;
+  def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi16i64)),
+            (v2i64 (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0))>;
+
+  let Predicates = [IsBigEndian, HasP9Vector] in {
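+  // Scalar stores of i8. Element 7 is stored directly; any other element i is
+  // first moved into place with VSLDOI $S, $S, n where n = (i - 7) mod 16.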
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
+            (STXSIBXv $S, xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+
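+  // Scalar stores of i16. Element 3 is stored directly; any other element i is
+  // first moved into place with VSLDOI $S, $S, n where n = 2 * (i - 3) mod 16.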
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
+            (STXSIHXv $S, xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+  } // IsBigEndian, HasP9Vector
+
+  let Predicates = [IsLittleEndian, HasP9Vector] in {
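+  // Scalar stores of i8. Element 8 is stored directly; any other element i is
+  // first moved into place with VSLDOI $S, $S, n where n = (8 - i) mod 16.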
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 7), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 5), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 3), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 1), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
+            (STXSIBXv $S, xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 15), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 13), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 11), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+  def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
+            (STXSIBXv (VSLDOI $S, $S, 9), xoaddr:$dst)>;
+
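+  // Scalar stores of i16. Element 4 is stored directly; any other element i is
+  // first moved into place with VSLDOI $S, $S, n where n = 2 * (4 - i) mod 16.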
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 8), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 6), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 4), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 2), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
+            (STXSIHXv $S, xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 14), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 12), xoaddr:$dst)>;
+  def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
+            (STXSIHXv (VSLDOI $S, $S, 10), xoaddr:$dst)>;
+  } // IsLittleEndian, HasP9Vector
+
+  // Vector sign extensions
+  def : Pat<(f64 (PPCVexts f64:$A, 1)),
+            (f64 (COPY_TO_REGCLASS (VEXTSB2Ds $A), VSFRC))>;
+  def : Pat<(f64 (PPCVexts f64:$A, 2)),
+            (f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>;
 } // end HasP9Vector, AddedComplexity

 let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@ -170,11 +170,68 @@ bool PPCMIPeephole::simplifyCode(void) {
                ToErase = &MI;
                Simplified = true;
              }
+            } else if ((Immed == 0 || Immed == 3) &&
+                       DefMI && DefMI->getOpcode() == PPC::XXPERMDIs) {
+              // Splat fed by another splat - switch the output of the first
+              // and remove the second.
+              DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
+              ToErase = &MI;
+              Simplified = true;
+              DEBUG(dbgs() << "Removing redundant splat: ");
+              DEBUG(MI.dump());
            }
          }
        }
        break;
      }
+      // Splat simplifications. The splat's register source is operand 1 for
+      // XXSPLTW and operand 2 for VSPLTB/VSPLTH (see OpNo below).
+      case PPC::VSPLTB:
+      case PPC::VSPLTH:
+      case PPC::XXSPLTW: {
+        unsigned MyOpcode = MI.getOpcode();
+        unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
+        // Look through copy-like instructions (per the helper's name) to find
+        // the instruction that defines the splat's input.
+        unsigned TrueReg = lookThruCopyLike(MI.getOperand(OpNo).getReg());
+        MachineInstr *DefMI = MRI->getVRegDef(TrueReg);
+        if (!DefMI)
+          break;
+        unsigned DefOpcode = DefMI->getOpcode();
+        bool SameOpcode = (MyOpcode == DefOpcode) ||
+          (MyOpcode == PPC::VSPLTB && DefOpcode == PPC::VSPLTBs) ||
+          (MyOpcode == PPC::VSPLTH && DefOpcode == PPC::VSPLTHs) ||
+          (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::XXSPLTWs);
+        // Splat fed by another splat - switch the output of the first
+        // and remove the second.
+        // NOTE(review): this retargets DefMI's result without checking whether
+        // its previous result register has other users - confirm that is safe.
+        if (SameOpcode) {
+          DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
+          ToErase = &MI;
+          Simplified = true;
+          DEBUG(dbgs() << "Removing redundant splat: ");
+          DEBUG(MI.dump());
+        }
+        // Splat fed by a shift. Usually when we align value to splat into
+        // vector element zero.
+        // This fold is restricted to XXSPLTW: the code below treats operand 1
+        // as the source register and operand 2 as the immediate, and combines
+        // the immediates with word-granular "& 0x3" arithmetic. VSPLTB/VSPLTH
+        // keep their register in operand 2, so reading it as an immediate
+        // would be invalid.
+        if (MyOpcode == PPC::XXSPLTW && DefOpcode == PPC::XXSLDWI) {
+          unsigned ShiftRes = DefMI->getOperand(0).getReg();
+          unsigned ShiftOp1 = DefMI->getOperand(1).getReg();
+          unsigned ShiftOp2 = DefMI->getOperand(2).getReg();
+          unsigned ShiftImm = DefMI->getOperand(3).getImm();
+          unsigned SplatImm = MI.getOperand(2).getImm();
+          // Only a shift whose two inputs are the same register is folded.
+          if (ShiftOp1 == ShiftOp2) {
+            unsigned NewElem = (SplatImm + ShiftImm) & 0x3;
+            // The shift itself can go only if this splat was its sole user.
+            if (MRI->hasOneNonDBGUse(ShiftRes)) {
+              DEBUG(dbgs() << "Removing redundant shift: ");
+              DEBUG(DefMI->dump());
+              ToErase = DefMI;
+            }
+            Simplified = true;
+            DEBUG(dbgs() << "Changing splat immediate from " << SplatImm <<
+                  " to " << NewElem << " in instruction: ");
+            DEBUG(MI.dump());
+            MI.getOperand(1).setReg(ShiftOp1);
+            MI.getOperand(2).setImm(NewElem);
+          }
+        }
+        break;
+      }
      }
    }

--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@ -303,7 +303,6 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
  case PPC::VRRCRegClassID:
  case PPC::VFRCRegClassID:
  case PPC::VSLRCRegClassID:
-  case PPC::VSHRCRegClassID:
    return 32 - DefaultSafety;
  case PPC::VSRCRegClassID:
  case PPC::VSFRCRegClassID:
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@ -17,7 +17,6 @@ def sub_eq : SubRegIndex<1, 2>;
 def sub_un : SubRegIndex<1, 3>;
 def sub_32 : SubRegIndex<32>;
 def sub_64 : SubRegIndex<64>;
-def sub_128 : SubRegIndex<128>;
 }


@ -79,15 +78,6 @@ class VSRL<FPR SubReg, string n> : PPCReg<n> {
  let SubRegIndices = [sub_64];
 }

-// VSRH - One of the 32 128-bit VSX registers that overlap with the vector
-// registers.
-class VSRH<VR SubReg, string n> : PPCReg<n> {
-  let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
-  let HWEncoding{5} = 1;
-  let SubRegs = [SubReg];
-  let SubRegIndices = [sub_128];
-}
-
 // CR - One of the 8 4-bit condition registers
 class CR<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
  let HWEncoding{2-0} = num;
@ -116,9 +106,12 @@ foreach Index = 0-31 in {
                DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
 }

-// Floating-point vector subregisters (for VSX)
+// 64-bit Floating-point subregisters of Altivec registers
+// Note: the register names are v0-v31 or vs32-vs63 depending on the use.
+//       Custom C++ code is used to produce the correct name and encoding.
 foreach Index = 0-31 in {
-  def VF#Index : VF<Index, "vs" # !add(Index, 32)>;
+  def VF#Index : VF<Index, "v" #Index>,
+                 DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
 }

 // QPX Floating-point registers
@ -138,9 +131,11 @@ foreach Index = 0-31 in {
  def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
                  DwarfRegAlias<!cast<FPR>("F"#Index)>;
 }
-foreach Index = 0-31 in {
-  def VSH#Index : VSRH<!cast<VR>("V"#Index), "vs" # !add(Index, 32)>,
-                  DwarfRegAlias<!cast<VR>("V"#Index)>;
+
+// Dummy VSX registers. These define the strings "vs32"-"vs63" and are only
+// used for asm printing.
+foreach Index = 32-63 in {
+  def VSX#Index : PPCReg<"vs"#Index>;
 }

 // The representation of r0 when treated as the constant 0.
@ -288,7 +283,7 @@ def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
                                                (sequence "F%u", 31, 14))>;
 def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;

-def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32], 128,
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32,v2f64], 128,
                         (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
                             V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
                             V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
@ -298,14 +293,8 @@ def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32], 128,
 def VSLRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
                          (add (sequence "VSL%u", 0, 13),
                               (sequence "VSL%u", 31, 14))>;
-def VSHRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
-                          (add VSH2, VSH3, VSH4, VSH5, VSH0, VSH1, VSH6, VSH7,
-			       VSH8, VSH9, VSH10, VSH11, VSH12, VSH13, VSH14,
-                               VSH15, VSH16, VSH17, VSH18, VSH19, VSH31, VSH30,
-                               VSH29, VSH28, VSH27, VSH26, VSH25, VSH24, VSH23,
-                               VSH22, VSH21, VSH20)>;
 def VSRC  : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
-                          (add VSLRC, VSHRC)>;
+                          (add VSLRC, VRRC)>;

 // Register classes for the 64-bit "scalar" VSX subregisters.
 def VFRC :  RegisterClass<"PPC", [f64], 64,
--- a/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ b/lib/Target/PowerPC/PPCVSXCopy.cpp
@ -101,11 +101,8 @@ protected:
          // This is a copy *to* a VSX register from a non-VSX register.
          Changed = true;

-          const TargetRegisterClass *SrcRC =
-            IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
-                                           &PPC::VSLRCRegClass;
+          const TargetRegisterClass *SrcRC = &PPC::VSLRCRegClass;
          assert((IsF8Reg(SrcMO.getReg(), MRI) ||
-                  IsVRReg(SrcMO.getReg(), MRI) ||
                  IsVSSReg(SrcMO.getReg(), MRI) ||
                  IsVSFReg(SrcMO.getReg(), MRI)) &&
                 "Unknown source for a VSX copy");
@ -116,8 +113,7 @@ protected:
              .addImm(1) // add 1, not 0, because there is no implicit clearing
                         // of the high bits.
              .addOperand(SrcMO)
-              .addImm(IsVRReg(SrcMO.getReg(), MRI) ? PPC::sub_128
-                                                   : PPC::sub_64);
+              .addImm(PPC::sub_64);

          // The source of the original copy is now the new virtual register.
          SrcMO.setReg(NewVReg);
@ -126,13 +122,10 @@ protected:
          // This is a copy *from* a VSX register to a non-VSX register.
          Changed = true;

-          const TargetRegisterClass *DstRC =
-            IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass :
-                                           &PPC::VSLRCRegClass;
+          const TargetRegisterClass *DstRC = &PPC::VSLRCRegClass;
          assert((IsF8Reg(DstMO.getReg(), MRI) ||
                  IsVSFReg(DstMO.getReg(), MRI) ||
-                  IsVSSReg(DstMO.getReg(), MRI) ||
-                  IsVRReg(DstMO.getReg(), MRI)) &&
+                  IsVSSReg(DstMO.getReg(), MRI)) &&
                 "Unknown destination for a VSX copy");

          // Copy the VSX value into a new VSX register of the correct subclass.
@ -143,8 +136,7 @@ protected:

          // Transform the original copy into a subregister extraction copy.
          SrcMO.setReg(NewVReg);
-          SrcMO.setSubReg(IsVRReg(DstMO.getReg(), MRI) ? PPC::sub_128 :
-                                                         PPC::sub_64);
+          SrcMO.setSubReg(PPC::sub_64);
        }
      }

--- a/test/CodeGen/PowerPC/inline-asm-scalar-to-vector-error.ll
+++ b/test/CodeGen/PowerPC/inline-asm-scalar-to-vector-error.ll
@ -7,8 +7,5 @@ define hidden void @f(i32 %x) {
  ; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
  tail call void asm sideeffect "nop", "{vsl1}"(i32 %x) nounwind

-  ; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
-  tail call void asm sideeffect "nop", "{vsh1}"(i32 %x) nounwind
-
  ret void
 }
--- a/test/CodeGen/PowerPC/load-v4i8-improved.ll
+++ b/test/CodeGen/PowerPC/load-v4i8-improved.ll
@ -1,8 +1,7 @@
 ; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck \
 ; RUN:   -implicit-check-not vmrg -implicit-check-not=vperm %s
 ; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck \
-; RUN:   -implicit-check-not vmrg -implicit-check-not=vperm %s \
-; RUN:   --check-prefix=CHECK-BE
+; RUN:   -implicit-check-not vmrg -implicit-check-not=vperm %s

 define <16 x i8> @test(i32* %s, i32* %t) {
 entry:
@ -11,13 +10,6 @@ entry:
  %2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i8> %2
 ; CHECK-LABEL: test
-; CHECK: lwz [[GPR:[0-9]+]], 0(3)
-; CHECK: mtvsrd [[VSR:[0-9]+]], [[GPR]]
-; CHECK: xxswapd  [[SWP:[0-9]+]], [[VSR]]
-; CHECK: xxspltw 34, [[SWP]], 3
-; CHECK-BE-LABEL: test
-; CHECK-BE: lwz [[GPR:[0-9]+]], 0(3)
-; CHECK-BE: sldi [[SHL:[0-9]+]], [[GPR]], 32
-; CHECK-BE: mtvsrd [[VSR:[0-9]+]], [[SHL]]
-; CHECK-BE: xxspltw 34, [[VSR]], 0
+; CHECK: lxsiwax 34, 0, 3
+; CHECK: xxspltw 34, 34, 1
 }
--- a/test/CodeGen/PowerPC/machine-combiner.ll
+++ b/test/CodeGen/PowerPC/machine-combiner.ll
@ -98,7 +98,6 @@ define <4 x float> @vector_reassociate_adds1(<4 x float> %x0, <4 x float> %x1, <
 ; CHECK-PWR:       xvaddsp [[REG0:[0-9]+]], 34, 35
 ; CHECK-PWR:       xvaddsp [[REG1:[0-9]+]], 36, 37
 ; CHECK-PWR:       xvaddsp 34, [[REG0]], [[REG1]]
-; CHECK-PWR:       # kill
 ; CHECK-NEXT:  blr

  %t0 = fadd <4 x float> %x0, %x1
@ -116,7 +115,6 @@ define <4 x float> @vector_reassociate_adds2(<4 x float> %x0, <4 x float> %x1, <
 ; CHECK-PWR:       xvaddsp [[REG0:[0-9]+]], 34, 35
 ; CHECK-PWR:       xvaddsp [[REG1:[0-9]+]], 36, 37
 ; CHECK-PWR:       xvaddsp 34, [[REG0]], [[REG1]]
-; CHECK-PWR:       # kill
 ; CHECK-NEXT:  blr

  %t0 = fadd <4 x float> %x0, %x1
@ -134,7 +132,6 @@ define <4 x float> @vector_reassociate_adds3(<4 x float> %x0, <4 x float> %x1, <
 ; CHECK-PWR:       xvaddsp [[REG0:[0-9]+]], 34, 35
 ; CHECK-PWR:       xvaddsp [[REG1:[0-9]+]], 36, 37
 ; CHECK-PWR:       xvaddsp 34, [[REG0]], [[REG1]]
-; CHECK-PWR:       # kill
 ; CHECK-NEXT:  blr

  %t0 = fadd <4 x float> %x0, %x1
@ -152,7 +149,6 @@ define <4 x float> @vector_reassociate_adds4(<4 x float> %x0, <4 x float> %x1, <
 ; CHECK-PWR:       xvaddsp [[REG0:[0-9]+]], 34, 35
 ; CHECK-PWR:       xvaddsp [[REG1:[0-9]+]], 36, 37
 ; CHECK-PWR:       xvaddsp 34, [[REG0]], [[REG1]]
-; CHECK-PWR:       # kill
 ; CHECK-NEXT:  blr

  %t0 = fadd <4 x float> %x0, %x1
--- a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
+++ b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
@ -63,7 +63,7 @@ entry:
  ret <2 x i64> %splat.splat
 ; CHECK: mtvsrd {{[0-9]+}}, 3
 ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
-; CHECK-LE: xxspltd [[REG1]], [[REG1]], 0
+; CHECK-LE: xxspltd 34, [[REG1]], 0
 }

 ; Function Attrs: nounwind
@ -75,9 +75,10 @@ entry:
  %splat.splatinsert = insertelement <4 x float> undef, float %0, i32 0
  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat.splat
-; CHECK: xscvdpspn {{[0-9]+}}, 1
+; CHECK: xscvdpspn [[REG1:[0-9]+]], 1
+; CHECK: xxspltw 34, [[REG1]]
 ; CHECK-LE: xscvdpspn [[REG1:[0-9]+]], 1
-; CHECK-LE: xxsldwi {{[0-9]+}}, [[REG1]], [[REG1]], 1
+; CHECK-LE: xxspltw 34, [[REG1]]
 }

 ; The optimization to remove stack operations from PPCDAGToDAGISel::Select
--- a/test/CodeGen/PowerPC/power9-moves-and-splats.ll
+++ b/test/CodeGen/PowerPC/power9-moves-and-splats.ll
@ -7,10 +7,18 @@

 define <2 x i64> @test1(i64 %a, i64 %b) {
 entry:
+; The FIXME below is due to the lowering for BUILD_VECTOR needing a revamp,
+; which will happen in a subsequent patch.
 ; CHECK-LABEL: test1
-; CHECK: mtvsrdd 34, 4, 3
+; FIXME: mtvsrdd 34, 4, 3
+; CHECK: mtvsrd {{[0-9]+}}, 3
+; CHECK: mtvsrd {{[0-9]+}}, 4
+; CHECK: xxmrgld
 ; CHECK-BE-LABEL: test1
-; CHECK-BE: mtvsrdd 34, 3, 4
+; FIXME-BE: mtvsrdd 34, 3, 4
+; CHECK-BE: mtvsrd {{[0-9]+}}, 4
+; CHECK-BE: mtvsrd {{[0-9]+}}, 3
+; CHECK-BE: xxmrghd
  %vecins = insertelement <2 x i64> undef, i64 %a, i32 0
  %vecins1 = insertelement <2 x i64> %vecins, i64 %b, i32 1
  ret <2 x i64> %vecins1
--- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll
+++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll
@ -55,9 +55,12 @@ define <1 x i128> @v1i128_increment_by_one(<1 x i128> %a) nounwind {
 ; CHECK-LE: blr

 ; CHECK-P9-LABEL: @v1i128_increment_by_one
-; CHECK-P9-DAG: li [[R1:r[0-9]+]], 1
-; CHECK-P9-DAG: li [[R2:r[0-9]+]], 0
-; CHECK-P9: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]]
+; The FIXMEs below are due to the lowering for BUILD_VECTOR, which will be
+; fixed in a subsequent patch.
+; FIXME: li [[R1:r[0-9]+]], 1
+; FIXME: li [[R2:r[0-9]+]], 0
+; FIXME: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]]
+; CHECK-P9: lxvx [[V1:v[0-9]+]]
 ; CHECK-P9: vadduqm v2, v2, [[V1]]
 ; CHECK-P9: blr

--- a/test/CodeGen/PowerPC/select-i1-vs-i1.ll
+++ b/test/CodeGen/PowerPC/select-i1-vs-i1.ll
@ -714,18 +714,12 @@ entry:
  %cond = select i1 %cmp3, <4 x float> %a1, <4 x float> %a2
  ret <4 x float> %cond

-; FIXME: This test (and the other v4f32 tests) should use the same bclr
-; technique as the v2f64 tests below.
-
 ; CHECK-LABEL: @testv4floatslt
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 34, 34
-; CHECK-DAG: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 35, 35
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bclr 12, [[REG1]], 0
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -740,12 +734,9 @@ entry:
 ; CHECK-LABEL: @testv4floatult
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 34, 34
-; CHECK-DAG: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 35, 35
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bclr 12, [[REG1]], 0
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -760,12 +751,9 @@ entry:
 ; CHECK-LABEL: @testv4floatsle
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 34, 34
-; CHECK-DAG: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 35, 35
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bclr 12, [[REG1]], 0
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -780,12 +768,9 @@ entry:
 ; CHECK-LABEL: @testv4floatule
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 34, 34
-; CHECK-DAG: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 35, 35
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bclr 12, [[REG1]], 0
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -800,12 +785,11 @@ entry:
 ; CHECK-LABEL: @testv4floateq
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 35, 35
-; CHECK-DAG: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 34, 34
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bc 12, [[REG1]], .LBB[[BB1:[0-9_]+]]
+; CHECK: vor 3, 2, 2
+; CHECK: .LBB[[BB1]]
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -820,12 +804,9 @@ entry:
 ; CHECK-LABEL: @testv4floatsge
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 34, 34
-; CHECK-DAG: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 35, 35
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bclr 12, [[REG1]], 0
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -840,12 +821,9 @@ entry:
 ; CHECK-LABEL: @testv4floatuge
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 34, 34
-; CHECK-DAG: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 35, 35
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bclr 12, [[REG1]], 0
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -860,12 +838,9 @@ entry:
 ; CHECK-LABEL: @testv4floatsgt
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 34, 34
-; CHECK-DAG: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 35, 35
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bclr 12, [[REG1]], 0
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -880,12 +855,9 @@ entry:
 ; CHECK-LABEL: @testv4floatugt
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 34, 34
-; CHECK-DAG: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 35, 35
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bclr 12, [[REG1]], 0
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -900,12 +872,9 @@ entry:
 ; CHECK-LABEL: @testv4floatne
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 3, 4
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
-; CHECK-DAG: xxlor [[REG2:[0-9]+]], 34, 34
-; CHECK-DAG: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK: bc 12, [[REG1]], .LBB[[BB:[0-9_]+]]
-; CHECK: xxlor [[REG2]], 35, 35
-; CHECK: .LBB[[BB]]:
-; CHECK: xxlor 34, [[REG2]], [[REG2]]
+; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK: bclr 12, [[REG1]], 0
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

@ -1023,7 +992,7 @@ entry:
 ; CHECK: bc 12, [[REG1]], .LBB[[BB55:[0-9_]+]]
 ; CHECK: vor 3, 2, 2
 ; CHECK: .LBB[[BB55]]
-; CHECK: xxlor 34, 35, 35
+; CHECK: vor 2, 3, 3
 ; CHECK: blr
 }

--- a/test/CodeGen/PowerPC/sjlj.ll
+++ b/test/CodeGen/PowerPC/sjlj.ll
@ -66,7 +66,7 @@ return:                                           ; preds = %if.end, %if.then
 ; CHECK-NOT: mfspr

 ; CHECK-DAG: stfd
-; CHECK-DAG: stvx
+; CHECK-DAG: stxvd2x

 ; CHECK-DAG: addis [[REG:[0-9]+]], 2, env_sigill@toc@ha
 ; CHECK-DAG: std 31, env_sigill@toc@l([[REG]])
@ -82,7 +82,7 @@ return:                                           ; preds = %if.end, %if.then
 ; CHECK: .LBB1_4:

 ; CHECK: lfd
-; CHECK: lvx
+; CHECK: lxvd2x
 ; CHECK: ld
 ; CHECK: blr

@ -93,11 +93,11 @@ return:                                           ; preds = %if.end, %if.then
 ; CHECK: li 3, 0

 ; CHECK-NOAV: @main
-; CHECK-NOAV-NOT: stvx
+; CHECK-NOAV-NOT: stxvd2x
 ; CHECK-NOAV: bcl
 ; CHECK-NOAV: mflr
 ; CHECK-NOAV: bl foo
-; CHECK-NOAV-NOT: lvx
+; CHECK-NOAV-NOT: lxvd2x
 ; CHECK-NOAV: blr
 }

--- a/test/CodeGen/PowerPC/vsx-args.ll
+++ b/test/CodeGen/PowerPC/vsx-args.ll
@ -1,5 +1,6 @@
 ; RUN: llc -verify-machineinstrs < %s -mcpu=pwr7 -mattr=+vsx | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mcpu=pwr7 -mattr=+vsx -fast-isel -O0 | \
+; RUN:   FileCheck -check-prefix=CHECK-FISL %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"

@ -13,13 +14,23 @@ entry:

 ; CHECK-LABEL: @main
 ; CHECK-DAG: vor [[V:[0-9]+]], 2, 2
-; CHECK-DAG: xxlor 34, 35, 35
-; CHECK-DAG: xxlor 35, 36, 36
+; CHECK-DAG: vor 2, 3, 3
+; CHECK-DAG: vor 3, 4, 4
 ; CHECK-DAG: vor 4, [[V]], [[V]]
-; CHECK-DAG: bl sv
-; CHECK-DAG: lxvd2x [[VC:[0-9]+]],
+; CHECK: bl sv
+; CHECK: lxvd2x [[VC:[0-9]+]],
 ; CHECK: xvadddp 34, 34, [[VC]]
 ; CHECK: blr
+
+; CHECK-FISL-LABEL: @main
+; CHECK-FISL: stxvd2x 34
+; CHECK-FISL: vor 2, 3, 3
+; CHECK-FISL: vor 3, 4, 4
+; CHECK-FISL: lxvd2x 36
+; CHECK-FISL: bl sv
+; CHECK-FISL: lxvd2x [[VC:[0-9]+]],
+; CHECK-FISL: xvadddp 34, 34, [[VC]]
+; CHECK-FISL: blr
 }

 attributes #0 = { noinline nounwind readnone }
--- a/test/CodeGen/PowerPC/vsx-infl-copy1.ll
+++ b/test/CodeGen/PowerPC/vsx-infl-copy1.ll
@ -11,7 +11,15 @@ entry:
  br label %vector.body

 ; CHECK-LABEL: @_Z8example9Pj
-; CHECK: xxlor
+; CHECK: vor
+; CHECK: vor
+; CHECK: vor
+; CHECK: vor
+; CHECK: vor
+; CHECK: vor
+; CHECK: vor
+; CHECK: vor
+; CHECK: vor

 vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
--- a/test/CodeGen/PowerPC/vsx-p8.ll
+++ b/test/CodeGen/PowerPC/vsx-p8.ll
@ -34,8 +34,7 @@ define <4 x float> @test32u(<4 x float>* %a) {
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test32u
-; CHECK-FISL: lxvw4x 0, 0, 3
-; CHECK-FISL: xxlor 34, 0, 0
+; CHECK-FISL: lxvw4x 34, 0, 3
 ; CHECK-FISL: blr
 }

@ -48,8 +47,7 @@ define void @test33u(<4 x float>* %a, <4 x float> %b) {
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test33u
-; CHECK-FISL: vor 3, 2, 2
-; CHECK-FISL: stxvw4x 35, 0, 3
+; CHECK-FISL: stxvw4x 34, 0, 3
 ; CHECK-FISL: blr
 }

--- a/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
+++ b/test/CodeGen/PowerPC/vsx-partword-int-loads-and-stores.ll
--- a/test/CodeGen/PowerPC/vsx-spill-norwstore.ll
+++ b/test/CodeGen/PowerPC/vsx-spill-norwstore.ll
@ -4,6 +4,7 @@ target triple = "powerpc64-unknown-linux-gnu"

@.str1 = external unnamed_addr constant [5 x i8], align 1
@.str10 = external unnamed_addr constant [9 x i8], align 1
+@.v2f64 = external unnamed_addr constant <2 x double>, align 16

 ; Function Attrs: nounwind
 define void @main() #0 {
@ -12,6 +13,7 @@ define void @main() #0 {
 ; CHECK: stxvd2x

 entry:
+  %val = load <2 x double>, <2 x double>* @.v2f64, align 16
  %0 = tail call <8 x i16> @llvm.ppc.altivec.vupkhsb(<16 x i8> <i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1>) #0
  %1 = tail call <8 x i16> @llvm.ppc.altivec.vupklsb(<16 x i8> <i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1>) #0
  br i1 false, label %if.then.i68.i, label %check.exit69.i
@ -23,7 +25,7 @@ check.exit69.i:                                   ; preds = %entry
  br i1 undef, label %if.then.i63.i, label %check.exit64.i

 if.then.i63.i:                                    ; preds = %check.exit69.i
-  tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str10, i64 0, i64 0), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str1, i64 0, i64 0)) #0
+  tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str10, i64 0, i64 0), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str1, i64 0, i64 0), <2 x double> %val) #0
  br label %check.exit64.i

 check.exit64.i:                                   ; preds = %if.then.i63.i, %check.exit69.i
--- a/test/CodeGen/PowerPC/vsx-vec-spill.ll
+++ b/test/CodeGen/PowerPC/vsx-vec-spill.ll
@ -0,0 +1,34 @@
+; RUN: llc < %s -march=ppc64 -mattr=+vsx -verify-machineinstrs | \
+; RUN:   FileCheck %s --check-prefix=VSX
+; RUN: llc < %s -march=ppc64 -mattr=-vsx -verify-machineinstrs | \
+; RUN:   FileCheck %s --check-prefix=NOVSX
+
+define <2 x double> @interleaving_VSX_VMX(
+  <2 x double> %a, <2 x double> %b, <2 x double> %c,
+  <2 x double> %d, <2 x double> %e, <2 x double> %f) {
+entry:
+  tail call void asm sideeffect "# clobbers",
+    "~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() nounwind
+  tail call void @goo(<2 x double> %a) nounwind
+  %add = fadd <2 x double> %a, %b
+  %sub = fsub <2 x double> %a, %b
+  %mul = fmul <2 x double> %add, %sub
+  %add1 = fadd <2 x double> %c, %d
+  %sub2 = fsub <2 x double> %c, %d
+  %mul3 = fmul <2 x double> %add1, %sub2
+  %add4 = fadd <2 x double> %mul, %mul3
+  %add5 = fadd <2 x double> %e, %f
+  %sub6 = fsub <2 x double> %e, %f
+  %mul7 = fmul <2 x double> %add5, %sub6
+  %add8 = fadd <2 x double> %add4, %mul7
+  ret <2 x double> %add8
+; VSX-LABEL: interleaving_VSX_VMX
+; VSX-NOT: stvx
+; VSX-NOT: lvx
+
+; NOVSX-LABEL: interleaving_VSX_VMX
+; NOVSX-NOT: stxvd2x
+; NOVSX-NOT: lxvd2x
+}
+
+declare void @goo(<2 x double>)
--- a/test/CodeGen/PowerPC/vsx.ll
+++ b/test/CodeGen/PowerPC/vsx.ll
@ -70,10 +70,7 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test5
-; CHECK-FISL: vor
-; CHECK-FISL: vor
-; CHECK-FISL: xxlxor
-; CHECK-FISL: vor 2
+; CHECK-FISL: xxlxor 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test5
@ -91,10 +88,7 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test6
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlxor 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlxor 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test6
@ -112,10 +106,7 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test7
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlxor 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlxor 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test7
@ -133,10 +124,7 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test8
-; CHECK-FISL: vor
-; CHECK-FISL: vor
-; CHECK-FISL: xxlor
-; CHECK-FISL: vor 2
+; CHECK-FISL: xxlor 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test8
@ -154,10 +142,7 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test9
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlor 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlor 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test9
@ -175,10 +160,7 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test10
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlor 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlor 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test10
@ -196,10 +178,7 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test11
-; CHECK-FISL: vor
-; CHECK-FISL: vor
-; CHECK-FISL: xxland
-; CHECK-FISL: vor 2
+; CHECK-FISL: xxland 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test11
@ -217,10 +196,7 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test12
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxland 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxland 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test12
@ -238,10 +214,7 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test13
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxland 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxland 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test13
@ -260,11 +233,8 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test14
-; CHECK-FISL: vor 4, 3, 3
-; CHECK-FISL: vor 5, 2, 2
-; CHECK-FISL: xxlor 0, 37, 36
-; CHECK-FISL: xxlnor 36, 37, 36
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlor 0, 34, 35
+; CHECK-FISL: xxlnor 34, 34, 35
 ; CHECK-FISL: lis 0, -1
 ; CHECK-FISL: ori 0, 0, 65520
 ; CHECK-FISL: stxvd2x 0, 1, 0
@ -286,17 +256,13 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test15
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlor 36, 36, 37
-; CHECK-FISL: vor 0, 4, 4
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlnor 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlor 0, 34, 35
+; CHECK-FISL: xxlor 36, 0, 0
+; CHECK-FISL: xxlnor 0, 34, 35
+; CHECK-FISL: xxlor 34, 0, 0
 ; CHECK-FISL: lis 0, -1
 ; CHECK-FISL: ori 0, 0, 65520
-; CHECK-FISL: stvx 0, 1, 0
+; CHECK-FISL: stxvd2x 36, 1, 0
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test15
@ -315,17 +281,13 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test16
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlor 36, 36, 37
-; CHECK-FISL: vor 0, 4, 4
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlnor 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlor 0, 34, 35
+; CHECK-FISL: xxlor 36, 0, 0
+; CHECK-FISL: xxlnor 0, 34, 35
+; CHECK-FISL: xxlor 34, 0, 0
 ; CHECK-FISL: lis 0, -1
 ; CHECK-FISL: ori 0, 0, 65520
-; CHECK-FISL: stvx 0, 1, 0
+; CHECK-FISL: stxvd2x 36, 1, 0
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test16
@ -344,11 +306,8 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test17
-; CHECK-FISL: vor 4, 3, 3
-; CHECK-FISL: vor 5, 2, 2
-; CHECK-FISL: xxlnor 36, 36, 36
-; CHECK-FISL: xxland 36, 37, 36
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlnor 35, 35, 35
+; CHECK-FISL: xxland 34, 34, 35
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test17
@ -367,17 +326,13 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test18
-; CHECK-FISL: vor 4, 3, 3
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlnor 36, 36, 37
-; CHECK-FISL: vor 0, 4, 4
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlandc 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlnor 0, 35, 35
+; CHECK-FISL: xxlor 36, 0, 0
+; CHECK-FISL: xxlandc 0, 34, 35
+; CHECK-FISL: xxlor 34, 0, 0
 ; CHECK-FISL: lis 0, -1
 ; CHECK-FISL: ori 0, 0, 65520
-; CHECK-FISL: stvx 0, 1, 0
+; CHECK-FISL: stxvd2x 36, 1, 0
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test18
@ -396,17 +351,13 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test19
-; CHECK-FISL: vor 4, 3, 3
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlnor 36, 36, 37
-; CHECK-FISL: vor 0, 4, 4
-; CHECK-FISL: vor 4, 2, 2
-; CHECK-FISL: vor 5, 3, 3
-; CHECK-FISL: xxlandc 36, 36, 37
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: xxlnor 0, 35, 35
+; CHECK-FISL: xxlor 36, 0, 0
+; CHECK-FISL: xxlandc 0, 34, 35
+; CHECK-FISL: xxlor 34, 0, 0
 ; CHECK-FISL: lis 0, -1
 ; CHECK-FISL: ori 0, 0, 65520
-; CHECK-FISL: stvx 0, 1, 0
+; CHECK-FISL: stxvd2x 36, 1, 0
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test19
@ -425,19 +376,9 @@ entry:
 ; CHECK-REG: xxsel 34, 35, 34, {{[0-9]+}}
 ; CHECK-REG: blr

-; FIXME: The fast-isel code is pretty miserable for this one.
-
 ; CHECK-FISL-LABEL: @test20
-; CHECK-FISL: vor 0, 5, 5
-; CHECK-FISL: vor 1, 4, 4
-; CHECK-FISL: vor 6, 3, 3
-; CHECK-FISL: vor 7, 2, 2
-; CHECK-FISL: vor 2, 1, 1
-; CHECK-FISL: vor 3, 0, 0
-; CHECK-FISL: vcmpequw 2, 2, 3
-; CHECK-FISL: vor 0, 2, 2
-; CHECK-FISL: xxsel 32, 38, 39, 32
-; CHECK-FISL: vor 2, 0, 0
+; CHECK-FISL: vcmpequw {{[0-9]+}}, 4, 5
+; CHECK-FISL: xxsel 34, 35, 34, {{[0-9]+}}
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test20
@ -458,13 +399,8 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test21
-; CHECK-FISL: vor 0, 5, 5
-; CHECK-FISL: vor 1, 4, 4
-; CHECK-FISL: vor 6, 3, 3
-; CHECK-FISL: vor 7, 2, 2
-; CHECK-FISL: xvcmpeqsp 32, 33, 32
-; CHECK-FISL: xxsel 32, 38, 39, 32
-; CHECK-FISL: vor 2, 0, 0
+; CHECK-FISL: xvcmpeqsp [[V1:[0-9]+]], 36, 37
+; CHECK-FISL: xxsel 34, 35, 34, [[V1]]
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test21
@ -491,14 +427,14 @@ entry:
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test22
-; CHECK-FISL-DAG: xvcmpeqsp {{[0-9]+}}, 33, 32
-; CHECK-FISL-DAG: xvcmpeqsp {{[0-9]+}}, 32, 32
-; CHECK-FISL-DAG: xvcmpeqsp {{[0-9]+}}, 33, 33
+; CHECK-FISL-DAG: xvcmpeqsp {{[0-9]+}}, 37, 37
+; CHECK-FISL-DAG: xvcmpeqsp {{[0-9]+}}, 36, 36
+; CHECK-FISL-DAG: xvcmpeqsp {{[0-9]+}}, 36, 37
 ; CHECK-FISL-DAG: xxlnor
 ; CHECK-FISL-DAG: xxlnor
 ; CHECK-FISL-DAG: xxlor
 ; CHECK-FISL-DAG: xxlor
-; CHECK-FISL: xxsel 0, 38, 39, {{[0-9]+}}
+; CHECK-FISL: xxsel 34, 35, 34, {{[0-9]+}}
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test22
@ -526,11 +462,7 @@ entry:

 ; CHECK-FISL-LABEL: @test23
 ; CHECK-FISL: vcmpequh 4, 4, 5
-; CHECK-FISL: vor 0, 3, 3
-; CHECK-FISL: vor 1, 2, 2
-; CHECK-FISL: vor 6, 4, 4
-; CHECK-FISL: xxsel 32, 32, 33, 38
-; CHECK-FISL: vor 2, 0, 
+; CHECK-FISL: xxsel 34, 35, 34, 36
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test23
@ -552,11 +484,7 @@ entry:

 ; CHECK-FISL-LABEL: @test24
 ; CHECK-FISL: vcmpequb 4, 4, 5
-; CHECK-FISL: vor 0, 3, 3
-; CHECK-FISL: vor 1, 2, 2
-; CHECK-FISL: vor 6, 4, 4
-; CHECK-FISL: xxsel 32, 32, 33, 38
-; CHECK-FISL: vor 2, 0, 0
+; CHECK-FISL: xxsel 34, 35, 34, 36
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test24
@ -682,8 +610,6 @@ define <2 x i64> @test30(<2 x i64>* %a) {
 ; CHECK-FISL-LABEL: @test30
 ; CHECK-FISL: lxvd2x 0, 0, 3
 ; CHECK-FISL: xxlor 34, 0, 0
-; CHECK-FISL: vor 3, 2, 2
-; CHECK-FISL: vor 2, 3, 3
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test30
@ -715,8 +641,7 @@ define <4 x float> @test32(<4 x float>* %a) {
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test32
-; CHECK-FISL: lxvw4x 0, 0, 3
-; CHECK-FISL: xxlor 34, 0, 0
+; CHECK-FISL: lxvw4x 34, 0, 3
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test32
@ -734,8 +659,7 @@ define void @test33(<4 x float>* %a, <4 x float> %b) {
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test33
-; CHECK-FISL: vor 3, 2, 2
-; CHECK-FISL: stxvw4x 35, 0, 3
+; CHECK-FISL: stxvw4x 34, 0, 3
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test33
@ -770,8 +694,7 @@ define void @test33u(<4 x float>* %a, <4 x float> %b) {
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test33u
-; CHECK-FISL: vor 3, 2, 2
-; CHECK-FISL: stxvw4x 35, 0, 3
+; CHECK-FISL: stxvw4x 34, 0, 3
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test33u
@ -789,8 +712,7 @@ define <4 x i32> @test34(<4 x i32>* %a) {
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test34
-; CHECK-FISL: lxvw4x 0, 0, 3
-; CHECK-FISL: xxlor 34, 0, 0
+; CHECK-FISL: lxvw4x 34, 0, 3
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test34
@ -808,8 +730,7 @@ define void @test35(<4 x i32>* %a, <4 x i32> %b) {
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test35
-; CHECK-FISL: vor 3, 2, 2
-; CHECK-FISL: stxvw4x 35, 0, 3
+; CHECK-FISL: stxvw4x 34, 0, 3
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test35
@ -1086,10 +1007,7 @@ define <2 x i1> @test65(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test65
-; CHECK-FISL: vor 4, 3, 3
-; CHECK-FISL: vor 5, 2, 2
-; CHECK-FISL: vcmpequw 4, 5, 4
-; CHECK-FISL: vor 2, 4, 4
+; CHECK-FISL: vcmpequw 2, 2, 3
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test65
@ -1107,8 +1025,8 @@ define <2 x i1> @test66(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-REG: blr

 ; CHECK-FISL-LABEL: @test66
-; CHECK-FISL: vcmpequw {{[0-9]+}}, 5, 4
-; CHECK-FISL: xxlnor 34, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-FISL: vcmpequw 2, 2, 3
+; CHECK-FISL: xxlnor 34, 34, 34
 ; CHECK-FISL: blr

 ; CHECK-LE-LABEL: @test66