CellSPU:

- Fix fabs and fneg for f32 and f64.
- Use BuildVectorSDNode.isConstantSplat, now that the functionality exists.
- Continue to improve i64 constant lowering: lower certain special constants to the constant pool when they correspond to the special mask values of SPU's shufb instruction. This avoids the overhead of performing a shuffle on a zero-filled vector just to get the special constant when a memory load suffices.

llvm-svn: 67067
2024-11-24 03:33:20 +01:00 · 2009-03-17 01:15:45 +00:00 · 2009-03-17 01:15:45 +00:00 · 2c4ac99ef8
commit 2c4ac99ef8
parent b04be1838d
7 changed files with 424 additions and 497 deletions
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@ -200,182 +200,212 @@ namespace {

    return retval;
  }
-}

-namespace {
+  //! Generate the carry-generate shuffle mask.
+  SDValue getCarryGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) {
+    SmallVector<SDValue, 16 > ShufBytes;

-//===--------------------------------------------------------------------===//
-/// SPUDAGToDAGISel - Cell SPU-specific code to select SPU machine
-/// instructions for SelectionDAG operations.
-///
-class SPUDAGToDAGISel :
-  public SelectionDAGISel
-{
-  SPUTargetMachine &TM;
-  SPUTargetLowering &SPUtli;
-  unsigned GlobalBaseReg;
+    // Create the shuffle mask for "rotating" the carry up one register slot
+    // once the carry is generated.
+    ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));

-public:
-  explicit SPUDAGToDAGISel(SPUTargetMachine &tm) :
-    SelectionDAGISel(tm),
-    TM(tm),
-    SPUtli(*tm.getTargetLowering())
-  { }
-
-  virtual bool runOnFunction(Function &Fn) {
-    // Make sure we re-emit a set of the global base reg if necessary
-    GlobalBaseReg = 0;
-    SelectionDAGISel::runOnFunction(Fn);
-    return true;
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                       &ShufBytes[0], ShufBytes.size());
  }

-  /// getI32Imm - Return a target constant with the specified value, of type
-  /// i32.
-  inline SDValue getI32Imm(uint32_t Imm) {
-    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  //! Generate the borrow-generate shuffle mask
+  SDValue getBorrowGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) {
+    SmallVector<SDValue, 16 > ShufBytes;
+
+    // Create the shuffle mask for "rotating" the borrow up one register slot
+    // once the borrow is generated.
+    ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                       &ShufBytes[0], ShufBytes.size());
  }

-  /// getI64Imm - Return a target constant with the specified value, of type
-  /// i64.
-  inline SDValue getI64Imm(uint64_t Imm) {
-    return CurDAG->getTargetConstant(Imm, MVT::i64);
-  }
+  //===------------------------------------------------------------------===//
+  /// SPUDAGToDAGISel - Cell SPU-specific code to select SPU machine
+  /// instructions for SelectionDAG operations.
+  ///
+  class SPUDAGToDAGISel :
+    public SelectionDAGISel
+  {
+    SPUTargetMachine &TM;
+    SPUTargetLowering &SPUtli;
+    unsigned GlobalBaseReg;

-  /// getSmallIPtrImm - Return a target constant of pointer type.
-  inline SDValue getSmallIPtrImm(unsigned Imm) {
-    return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy());
+  public:
+    explicit SPUDAGToDAGISel(SPUTargetMachine &tm) :
+      SelectionDAGISel(tm),
+      TM(tm),
+      SPUtli(*tm.getTargetLowering())
+    { }
+
+    virtual bool runOnFunction(Function &Fn) {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      SelectionDAGISel::runOnFunction(Fn);
+      return true;
    }

-  SDNode *emitBuildVector(SDValue build_vec) {
-    MVT vecVT = build_vec.getValueType();
-    SDNode *bvNode = build_vec.getNode();
-    DebugLoc dl = bvNode->getDebugLoc();
-
-    // Check to see if this vector can be represented as a CellSPU immediate
-    // constant by invoking all of the instruction selection predicates:
-    if (((vecVT == MVT::v8i16) &&
-         (SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i16).getNode() != 0)) ||
-        ((vecVT == MVT::v4i32) &&
-         ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
-          (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
-          (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
-          (SPU::get_v4i32_imm(bvNode, *CurDAG).getNode() != 0))) ||
-        ((vecVT == MVT::v2i64) &&
-         ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) ||
-          (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) ||
-          (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i64).getNode() != 0))))
-      return Select(build_vec);
-
-    // No, need to emit a constant pool spill:
-    std::vector<Constant*> CV;
-
-    for (size_t i = 0; i < build_vec.getNumOperands(); ++i) {
-      ConstantSDNode *V = dyn_cast<ConstantSDNode > (build_vec.getOperand(i));
-      CV.push_back(const_cast<ConstantInt *> (V->getConstantIntValue()));
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(uint32_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
    }

-    Constant *CP = ConstantVector::get(CV);
-    SDValue CPIdx = CurDAG->getConstantPool(CP, SPUtli.getPointerTy());
-    unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-    SDValue CGPoolOffset =
-            SPU::LowerConstantPool(CPIdx, *CurDAG,
-                                   SPUtli.getSPUTargetMachine());
-    return SelectCode(CurDAG->getLoad(build_vec.getValueType(), dl,
-                                      CurDAG->getEntryNode(), CGPoolOffset,
-                                      PseudoSourceValue::getConstantPool(), 0,
-                                      false, Alignment));
-  }
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDValue getI64Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }

-  /// Select - Convert the specified operand from a target-independent to a
-  /// target-specific node if it hasn't already been changed.
-  SDNode *Select(SDValue Op);
-
-  //! Emit the instruction sequence for i64 shl
-  SDNode *SelectSHLi64(SDValue &Op, MVT OpVT);
-
-  //! Emit the instruction sequence for i64 srl
-  SDNode *SelectSRLi64(SDValue &Op, MVT OpVT);
-
-  //! Emit the instruction sequence for i64 sra
-  SDNode *SelectSRAi64(SDValue &Op, MVT OpVT);
-
-  //! Emit the necessary sequence for loading i64 constants:
-  SDNode *SelectI64Constant(SDValue &Op, MVT OpVT);
-
-  //! Returns true if the address N is an A-form (local store) address
-  bool SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
-                       SDValue &Index);
-
-  //! D-form address predicate
-  bool SelectDFormAddr(SDValue Op, SDValue N, SDValue &Base,
-                       SDValue &Index);
-
-  /// Alternate D-form address using i7 offset predicate
-  bool SelectDForm2Addr(SDValue Op, SDValue N, SDValue &Disp,
-                        SDValue &Base);
-
-  /// D-form address selection workhorse
-  bool DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Disp,
-                             SDValue &Base, int minOffset, int maxOffset);
-
-  //! Address predicate if N can be expressed as an indexed [r+r] operation.
-  bool SelectXFormAddr(SDValue Op, SDValue N, SDValue &Base,
-                       SDValue &Index);
-
-  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
-  /// inline asm expressions.
-  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                            char ConstraintCode,
-                                            std::vector<SDValue> &OutOps) {
-    SDValue Op0, Op1;
-    switch (ConstraintCode) {
-    default: return true;
-    case 'm':   // memory
-      if (!SelectDFormAddr(Op, Op, Op0, Op1)
-          && !SelectAFormAddr(Op, Op, Op0, Op1))
-        SelectXFormAddr(Op, Op, Op0, Op1);
-      break;
-    case 'o':   // offsetable
-      if (!SelectDFormAddr(Op, Op, Op0, Op1)
-          && !SelectAFormAddr(Op, Op, Op0, Op1)) {
-        Op0 = Op;
-        Op1 = getSmallIPtrImm(0);
+    /// getSmallIPtrImm - Return a target constant of pointer type.
+    inline SDValue getSmallIPtrImm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy());
      }
-      break;
-    case 'v':   // not offsetable
-#if 1
-      assert(0 && "InlineAsmMemoryOperand 'v' constraint not handled.");
-#else
-      SelectAddrIdxOnly(Op, Op, Op0, Op1);
-#endif
-      break;
+
+    SDNode *emitBuildVector(SDValue build_vec) {
+      MVT vecVT = build_vec.getValueType();
+      MVT eltVT = vecVT.getVectorElementType();
+      SDNode *bvNode = build_vec.getNode();
+      DebugLoc dl = bvNode->getDebugLoc();
+
+      // Check to see if this vector can be represented as a CellSPU immediate
+      // constant by invoking all of the instruction selection predicates:
+      if (((vecVT == MVT::v8i16) &&
+           (SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i16).getNode() != 0)) ||
+          ((vecVT == MVT::v4i32) &&
+           ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
+            (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
+            (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
+            (SPU::get_v4i32_imm(bvNode, *CurDAG).getNode() != 0))) ||
+          ((vecVT == MVT::v2i64) &&
+           ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) ||
+            (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) ||
+            (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i64).getNode() != 0))))
+        return Select(build_vec);
+
+      // No, need to emit a constant pool spill:
+      std::vector<Constant*> CV;
+
+      for (size_t i = 0; i < build_vec.getNumOperands(); ++i) {
+        ConstantSDNode *V = dyn_cast<ConstantSDNode > (build_vec.getOperand(i));
+        CV.push_back(const_cast<ConstantInt *> (V->getConstantIntValue()));
+      }
+
+      Constant *CP = ConstantVector::get(CV);
+      SDValue CPIdx = CurDAG->getConstantPool(CP, SPUtli.getPointerTy());
+      unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+      SDValue CGPoolOffset =
+              SPU::LowerConstantPool(CPIdx, *CurDAG,
+                                     SPUtli.getSPUTargetMachine());
+      return SelectCode(CurDAG->getLoad(build_vec.getValueType(), dl,
+                                        CurDAG->getEntryNode(), CGPoolOffset,
+                                        PseudoSourceValue::getConstantPool(), 0,
+                                        false, Alignment));
    }

-    OutOps.push_back(Op0);
-    OutOps.push_back(Op1);
-    return false;
-  }
+    /// Select - Convert the specified operand from a target-independent to a
+    /// target-specific node if it hasn't already been changed.
+    SDNode *Select(SDValue Op);

-  /// InstructionSelect - This callback is invoked by
-  /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
-  virtual void InstructionSelect();
+    //! Emit the instruction sequence for i64 shl
+    SDNode *SelectSHLi64(SDValue &Op, MVT OpVT);

-  virtual const char *getPassName() const {
-    return "Cell SPU DAG->DAG Pattern Instruction Selection";
-  }
+    //! Emit the instruction sequence for i64 srl
+    SDNode *SelectSRLi64(SDValue &Op, MVT OpVT);

-  /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
-  /// this target when scheduling the DAG.
-  virtual ScheduleHazardRecognizer *CreateTargetHazardRecognizer() {
-    const TargetInstrInfo *II = TM.getInstrInfo();
-    assert(II && "No InstrInfo?");
-    return new SPUHazardRecognizer(*II);
-  }
+    //! Emit the instruction sequence for i64 sra
+    SDNode *SelectSRAi64(SDValue &Op, MVT OpVT);

-  // Include the pieces autogenerated from the target description.
+    //! Emit the necessary sequence for loading i64 constants:
+    SDNode *SelectI64Constant(SDValue &Op, MVT OpVT, DebugLoc dl);
+
+    //! Alternate instruction emit sequence for loading i64 constants
+    SDNode *SelectI64Constant(uint64_t i64const, MVT OpVT, DebugLoc dl);
+
+    //! Returns true if the address N is an A-form (local store) address
+    bool SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
+                         SDValue &Index);
+
+    //! D-form address predicate
+    bool SelectDFormAddr(SDValue Op, SDValue N, SDValue &Base,
+                         SDValue &Index);
+
+    /// Alternate D-form address using i7 offset predicate
+    bool SelectDForm2Addr(SDValue Op, SDValue N, SDValue &Disp,
+                          SDValue &Base);
+
+    /// D-form address selection workhorse
+    bool DFormAddressPredicate(SDValue Op, SDValue N, SDValue &Disp,
+                               SDValue &Base, int minOffset, int maxOffset);
+
+    //! Address predicate if N can be expressed as an indexed [r+r] operation.
+    bool SelectXFormAddr(SDValue Op, SDValue N, SDValue &Base,
+                         SDValue &Index);
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDValue> &OutOps) {
+      SDValue Op0, Op1;
+      switch (ConstraintCode) {
+      default: return true;
+      case 'm':   // memory
+        if (!SelectDFormAddr(Op, Op, Op0, Op1)
+            && !SelectAFormAddr(Op, Op, Op0, Op1))
+          SelectXFormAddr(Op, Op, Op0, Op1);
+        break;
+      case 'o':   // offsetable
+        if (!SelectDFormAddr(Op, Op, Op0, Op1)
+            && !SelectAFormAddr(Op, Op, Op0, Op1)) {
+          Op0 = Op;
+          Op1 = getSmallIPtrImm(0);
+        }
+        break;
+      case 'v':   // not offsetable
+#if 1
+        assert(0 && "InlineAsmMemoryOperand 'v' constraint not handled.");
+#else
+        SelectAddrIdxOnly(Op, Op, Op0, Op1);
+#endif
+        break;
+      }
+
+      OutOps.push_back(Op0);
+      OutOps.push_back(Op1);
+      return false;
+    }
+
+    /// InstructionSelect - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelect();
+
+    virtual const char *getPassName() const {
+      return "Cell SPU DAG->DAG Pattern Instruction Selection";
+    }
+
+    /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+    /// this target when scheduling the DAG.
+    virtual ScheduleHazardRecognizer *CreateTargetHazardRecognizer() {
+      const TargetInstrInfo *II = TM.getInstrInfo();
+      assert(II && "No InstrInfo?");
+      return new SPUHazardRecognizer(*II);
+    }
+
+    // Include the pieces autogenerated from the target description.
 #include "SPUGenDAGISel.inc"
-};
-
+  };
 }

 /// InstructionSelect - This callback is invoked by
@ -689,7 +719,7 @@ SPUDAGToDAGISel::Select(SDValue Op) {
    // Catch the i64 constants that end up here. Note: The backend doesn't
    // attempt to legalize the constant (it's useless because DAGCombiner
    // will insert 64-bit constants and we can't stop it).
-    return SelectI64Constant(Op, OpVT);
+    return SelectI64Constant(Op, OpVT, Op.getDebugLoc());
  } else if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND)
             && OpVT == MVT::i64) {
    SDValue Op0 = Op.getOperand(0);
@ -747,21 +777,21 @@ SPUDAGToDAGISel::Select(SDValue Op) {
                                      zextShuffle));
  } else if (Opc == ISD::ADD && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
    SDNode *CGLoad =
-            emitBuildVector(SPU::getCarryGenerateShufMask(*CurDAG, dl));
+            emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl));

    return SelectCode(CurDAG->getNode(SPUISD::ADD64_MARKER, dl, OpVT,
                                      Op.getOperand(0), Op.getOperand(1),
                                      SDValue(CGLoad, 0)));
  } else if (Opc == ISD::SUB && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
    SDNode *CGLoad =
-            emitBuildVector(SPU::getBorrowGenerateShufMask(*CurDAG, dl));
+            emitBuildVector(getBorrowGenerateShufMask(*CurDAG, dl));

    return SelectCode(CurDAG->getNode(SPUISD::SUB64_MARKER, dl, OpVT,
                                      Op.getOperand(0), Op.getOperand(1),
                                      SDValue(CGLoad, 0)));
  } else if (Opc == ISD::MUL && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
    SDNode *CGLoad =
-            emitBuildVector(SPU::getCarryGenerateShufMask(*CurDAG, dl));
+            emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl));

    return SelectCode(CurDAG->getNode(SPUISD::MUL64_MARKER, dl, OpVT,
                                      Op.getOperand(0), Op.getOperand(1),
@ -813,6 +843,54 @@ SPUDAGToDAGISel::Select(SDValue Op) {
    if (OpVT == MVT::i64) {
      return SelectSRAi64(Op, OpVT);
    }
+  } else if (Opc == ISD::FNEG
+             && (OpVT == MVT::f64 || OpVT == MVT::v2f64)) {
+    DebugLoc dl = Op.getDebugLoc();
+    // Check if the pattern is a special form of DFNMS:
+    // (fneg (fsub (fmul R64FP:$rA, R64FP:$rB), R64FP:$rC))
+    SDValue Op0 = Op.getOperand(0);
+    if (Op0.getOpcode() == ISD::FSUB) {
+      SDValue Op00 = Op0.getOperand(0);
+      if (Op00.getOpcode() == ISD::FMUL) {
+        unsigned Opc = SPU::DFNMSf64;
+        if (OpVT == MVT::v2f64)
+          Opc = SPU::DFNMSv2f64;
+
+        return CurDAG->getTargetNode(Opc, dl, OpVT,
+                                     Op00.getOperand(0),
+                                     Op00.getOperand(1),
+                                     Op0.getOperand(1));
+      }
+    }
+
+    SDValue negConst = CurDAG->getConstant(0x8000000000000000ULL, MVT::i64);
+    SDNode *signMask = 0;
+    unsigned Opc = SPU::ORfneg64;
+
+    if (OpVT == MVT::f64) {
+      signMask = SelectI64Constant(negConst, MVT::i64, dl);
+    } else if (OpVT == MVT::v2f64) {
+      Opc = SPU::ORfnegvec;
+      signMask = emitBuildVector(CurDAG->getNode(ISD::BUILD_VECTOR, dl,
+						 MVT::v2i64,
+                                                 negConst, negConst));
+    }
+
+    return CurDAG->getTargetNode(Opc, dl, OpVT,
+				 Op.getOperand(0), SDValue(signMask, 0));
+  } else if (Opc == ISD::FABS) {
+    if (OpVT == MVT::f64) {
+      SDNode *signMask = SelectI64Constant(0x7fffffffffffffffULL, MVT::i64, dl);
+      return CurDAG->getTargetNode(SPU::ANDfabs64, dl, OpVT,
+                                   Op.getOperand(0), SDValue(signMask, 0));
+    } else if (OpVT == MVT::v2f64) {
+      SDValue absConst = CurDAG->getConstant(0x7fffffffffffffffULL, MVT::i64);
+      SDValue absVec = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
+                                       absConst, absConst);
+      SDNode *signMask = emitBuildVector(absVec);
+      return CurDAG->getTargetNode(SPU::ANDfabsvec, dl, OpVT,
+                                   Op.getOperand(0), SDValue(signMask, 0));
+    }
  } else if (Opc == SPUISD::LDRESULT) {
    // Custom select instructions for LDRESULT
    MVT VT = N->getValueType(0);
@ -1087,13 +1165,17 @@ SPUDAGToDAGISel::SelectSRAi64(SDValue &Op, MVT OpVT) {
 /*!
 Do the necessary magic necessary to load a i64 constant
 */
-SDNode *SPUDAGToDAGISel::SelectI64Constant(SDValue& Op, MVT OpVT) {
+SDNode *SPUDAGToDAGISel::SelectI64Constant(SDValue& Op, MVT OpVT,
+                                           DebugLoc dl) {
  ConstantSDNode *CN = cast<ConstantSDNode>(Op.getNode());
-  // Currently there's no DL on the input, but won't hurt to pretend.
-  DebugLoc dl = Op.getDebugLoc();
+  return SelectI64Constant(CN->getZExtValue(), OpVT, dl);
+}
+
+SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, MVT OpVT,
+                                           DebugLoc dl) {
  MVT OpVecVT = MVT::getVectorVT(OpVT, 2);
  SDValue i64vec =
-          SPU::LowerSplat_v2i64(OpVecVT, *CurDAG, CN->getZExtValue(), dl);
+          SPU::LowerV2I64Splat(OpVecVT, *CurDAG, Value64, dl);

  // Here's where it gets interesting, because we have to parse out the
  // subtree handed back in i64vec:
@ -1143,8 +1225,11 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(SDValue& Op, MVT OpVT) {
                                   SDValue(lhsNode, 0), SDValue(rhsNode, 0),
                                   SDValue(shufMaskNode, 0)));

-    return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT, 
+    return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT,
                                 SDValue(shufNode, 0));
+  } else if (i64vec.getOpcode() == ISD::BUILD_VECTOR) {
+    return CurDAG->getTargetNode(SPU::ORi64_v2i64, dl, OpVT,
+                                 SDValue(emitBuildVector(i64vec), 0));
  } else {
    cerr << "SPUDAGToDAGISel::SelectI64Constant: Unhandled i64vec condition\n";
    abort();
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@ -1,5 +1,5 @@
-//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
 //
+//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
@ -1353,7 +1353,7 @@ getVecImm(SDNode *N) {
    }
  }

-  return 0; // All UNDEF: use implicit def.; not Constant node
+  return 0;
 }

 /// get_vec_i18imm - Test if this vector is a vector filled with the same value
@ -1480,131 +1480,30 @@ SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
  return SDValue();
 }

-// If this is a vector of constants or undefs, get the bits.  A bit in
-// UndefBits is set if the corresponding element of the vector is an
-// ISD::UNDEF value.  For undefs, the corresponding VectorBits values are
-// zero.   Return true if this is not an array of constants, false if it is.
-//
-static bool GetConstantBuildVectorBits(SDNode *BV, uint64_t VectorBits[2],
-                                       uint64_t UndefBits[2]) {
-  // Start with zero'd results.
-  VectorBits[0] = VectorBits[1] = UndefBits[0] = UndefBits[1] = 0;
-
-  unsigned EltBitSize = BV->getOperand(0).getValueType().getSizeInBits();
-  for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
-    SDValue OpVal = BV->getOperand(i);
-
-    unsigned PartNo = i >= e/2;     // In the upper 128 bits?
-    unsigned SlotNo = e/2 - (i & (e/2-1))-1;  // Which subpiece of the uint64_t.
-
-    uint64_t EltBits = 0;
-    if (OpVal.getOpcode() == ISD::UNDEF) {
-      uint64_t EltUndefBits = ~0ULL >> (64-EltBitSize);
-      UndefBits[PartNo] |= EltUndefBits << (SlotNo*EltBitSize);
-      continue;
-    } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
-      EltBits = CN->getZExtValue() & (~0ULL >> (64-EltBitSize));
-    } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
-      const APFloat &apf = CN->getValueAPF();
-      EltBits = (CN->getValueType(0) == MVT::f32
-                 ? FloatToBits(apf.convertToFloat())
-                 : DoubleToBits(apf.convertToDouble()));
-    } else {
-      // Nonconstant element.
-      return true;
-    }
-
-    VectorBits[PartNo] |= EltBits << (SlotNo*EltBitSize);
-  }
-
-  //printf("%llx %llx  %llx %llx\n",
-  //       VectorBits[0], VectorBits[1], UndefBits[0], UndefBits[1]);
-  return false;
-}
-
-/// If this is a splat (repetition) of a value across the whole vector, return
-/// the smallest size that splats it.  For example, "0x01010101010101..." is a
-/// splat of 0x01, 0x0101, and 0x01010101.  We return SplatBits = 0x01 and
-/// SplatSize = 1 byte.
-static bool isConstantSplat(const uint64_t Bits128[2],
-                            const uint64_t Undef128[2],
-                            int MinSplatBits,
-                            uint64_t &SplatBits, uint64_t &SplatUndef,
-                            int &SplatSize) {
-  // Don't let undefs prevent splats from matching.  See if the top 64-bits are
-  // the same as the lower 64-bits, ignoring undefs.
-  uint64_t Bits64  = Bits128[0] | Bits128[1];
-  uint64_t Undef64 = Undef128[0] & Undef128[1];
-  uint32_t Bits32  = uint32_t(Bits64) | uint32_t(Bits64 >> 32);
-  uint32_t Undef32 = uint32_t(Undef64) & uint32_t(Undef64 >> 32);
-  uint16_t Bits16  = uint16_t(Bits32)  | uint16_t(Bits32 >> 16);
-  uint16_t Undef16 = uint16_t(Undef32) & uint16_t(Undef32 >> 16);
-
-  if ((Bits128[0] & ~Undef128[1]) == (Bits128[1] & ~Undef128[0])) {
-    if (MinSplatBits < 64) {
-
-      // Check that the top 32-bits are the same as the lower 32-bits, ignoring
-      // undefs.
-      if ((Bits64 & (~Undef64 >> 32)) == ((Bits64 >> 32) & ~Undef64)) {
-        if (MinSplatBits < 32) {
-
-          // If the top 16-bits are different than the lower 16-bits, ignoring
-          // undefs, we have an i32 splat.
-          if ((Bits32 & (~Undef32 >> 16)) == ((Bits32 >> 16) & ~Undef32)) {
-            if (MinSplatBits < 16) {
-              // If the top 8-bits are different than the lower 8-bits, ignoring
-              // undefs, we have an i16 splat.
-              if ((Bits16 & (uint16_t(~Undef16) >> 8))
-                  == ((Bits16 >> 8) & ~Undef16)) {
-                // Otherwise, we have an 8-bit splat.
-                SplatBits  = uint8_t(Bits16)  | uint8_t(Bits16 >> 8);
-                SplatUndef = uint8_t(Undef16) & uint8_t(Undef16 >> 8);
-                SplatSize = 1;
-                return true;
-              }
-            } else {
-              SplatBits = Bits16;
-              SplatUndef = Undef16;
-              SplatSize = 2;
-              return true;
-            }
-          }
-        } else {
-          SplatBits = Bits32;
-          SplatUndef = Undef32;
-          SplatSize = 4;
-          return true;
-        }
-      }
-    } else {
-      SplatBits = Bits128[0];
-      SplatUndef = Undef128[0];
-      SplatSize = 8;
-      return true;
-    }
-  }
-
-  return false;  // Can't be a splat if two pieces don't match.
-}
-
 //! Lower a BUILD_VECTOR instruction creatively:
 SDValue
 LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
+  MVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();
-  // If this is a vector of constants or undefs, get the bits.  A bit in
-  // UndefBits is set if the corresponding element of the vector is an
-  // ISD::UNDEF value.  For undefs, the corresponding VectorBits values are
-  // zero.
-  uint64_t VectorBits[2];
-  uint64_t UndefBits[2];
-  uint64_t SplatBits, SplatUndef;
-  int SplatSize;
-  if (GetConstantBuildVectorBits(Op.getNode(), VectorBits, UndefBits)
-      || !isConstantSplat(VectorBits, UndefBits,
-                          VT.getVectorElementType().getSizeInBits(),
-                          SplatBits, SplatUndef, SplatSize))
-    return SDValue();   // Not a constant vector, not a splat.
+  BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
+  unsigned minSplatBits = EltVT.getSizeInBits();
+
+  if (minSplatBits < 16)
+    minSplatBits = 16;
+
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                            HasAnyUndefs, minSplatBits)
+      || minSplatBits < SplatBitSize)
+    return SDValue();   // Wasn't a constant vector or splat exceeded min
+
+  uint64_t SplatBits = APSplatBits.getZExtValue();
+  unsigned SplatSize = SplatBitSize / 8;

  switch (VT.getSimpleVT()) {
  default:
@ -1620,8 +1519,7 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(Value32, MVT::i32);
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,
-                       DAG.getNode(ISD::BUILD_VECTOR, dl, 
-                                   MVT::v4i32, T, T, T, T));
+                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T, T, T, T));
    break;
  }
  case MVT::v2f64: {
@ -1636,45 +1534,42 @@ LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  }
  case MVT::v16i8: {
   // 8-bit constants have to be expanded to 16-bits
-   unsigned short Value16 = SplatBits | (SplatBits << 8);
-   SDValue Ops[8];
-   for (int i = 0; i < 8; ++i)
-     Ops[i] = DAG.getConstant(Value16, MVT::i16);
+   unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
+   SmallVector<SDValue, 8> Ops;
+
+   Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
   return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
-                      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, Ops, 8));
+                      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
  }
  case MVT::v8i16: {
-    unsigned short Value16;
-    if (SplatSize == 2)
-      Value16 = (unsigned short) (SplatBits & 0xffff);
-    else
-      Value16 = (unsigned short) (SplatBits | (SplatBits << 8));
-    SDValue T = DAG.getConstant(Value16, VT.getVectorElementType());
-    SDValue Ops[8];
-    for (int i = 0; i < 8; ++i) Ops[i] = T;
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops, 8);
+    unsigned short Value16 = SplatBits;
+    SDValue T = DAG.getConstant(Value16, EltVT);
+    SmallVector<SDValue, 8> Ops;
+
+    Ops.assign(8, T);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
  }
  case MVT::v4i32: {
-    unsigned int Value = SplatBits;
-    SDValue T = DAG.getConstant(Value, VT.getVectorElementType());
+    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
  }
  case MVT::v2i32: {
-    unsigned int Value = SplatBits;
-    SDValue T = DAG.getConstant(Value, VT.getVectorElementType());
+    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T);
  }
  case MVT::v2i64: {
-    return SPU::LowerSplat_v2i64(VT, DAG, SplatBits, dl);
+    return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
  }
  }

  return SDValue();
 }

+/*!
+ */
 SDValue
-SPU::LowerSplat_v2i64(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
-                      DebugLoc dl) {
+SPU::LowerV2I64Splat(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
+                     DebugLoc dl) {
  uint32_t upper = uint32_t(SplatVal >> 32);
  uint32_t lower = uint32_t(SplatVal);

@ -1685,10 +1580,6 @@ SPU::LowerSplat_v2i64(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                   Val, Val, Val, Val));
  } else {
-    SDValue LO32;
-    SDValue HI32;
-    SmallVector<SDValue, 16> ShufBytes;
-    SDValue Result;
    bool upper_special, lower_special;

    // NOTE: This code creates common-case shuffle masks that can be easily
@ -1699,6 +1590,18 @@ SPU::LowerSplat_v2i64(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
    upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
    lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);

+    // Both halves are special, so lower the splat to a constant pool load:
+    if (lower_special && upper_special) {
+      SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64);
+      return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
+                         SplatValCN, SplatValCN);
+    }
+
+    SDValue LO32;
+    SDValue HI32;
+    SmallVector<SDValue, 16> ShufBytes;
+    SDValue Result;
+
    // Create lower vector if not a special pattern
    if (!lower_special) {
      SDValue LO32C = DAG.getConstant(lower, MVT::i32);
@ -1721,13 +1624,6 @@ SPU::LowerSplat_v2i64(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
      LO32 = HI32;
    if (upper_special)
      HI32 = LO32;
-    if (lower_special && upper_special) {
-      // Unhappy situation... both upper and lower are special, so punt with
-      // a target constant:
-      SDValue Zero = DAG.getConstant(0, MVT::i32);
-      HI32 = LO32 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Zero, Zero,
-                                Zero, Zero);
-    }

    for (int i = 0; i < 4; ++i) {
      uint64_t val = 0;
@ -2022,9 +1918,9 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
      ShufMask[i] = DAG.getConstant(bits, MVT::i32);
    }

-    SDValue ShufMaskVec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
-                                      &ShufMask[0],
-                                      sizeof(ShufMask) / sizeof(ShufMask[0]));
+    SDValue ShufMaskVec =
+      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                  &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));

    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
                         DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
@ -2067,28 +1963,28 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
      /*NOTREACHED*/
    case MVT::i8: {
      SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
-      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, factor, factor,
-                              factor, factor);
+      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                              factor, factor, factor, factor);
      break;
    }
    case MVT::i16: {
      SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
-      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, factor, factor,
-                              factor, factor);
+      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                              factor, factor, factor, factor);
      break;
    }
    case MVT::i32:
    case MVT::f32: {
      SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
-      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, factor, factor,
-                              factor, factor);
+      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                              factor, factor, factor, factor);
      break;
    }
    case MVT::i64:
    case MVT::f64: {
      SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
      SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
-      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 
+      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              loFactor, hiFactor, loFactor, hiFactor);
      break;
    }
@ -2164,71 +2060,65 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
  case ISD::ROTR:
  case ISD::ROTL: {
    SDValue N1 = Op.getOperand(1);
-    unsigned N1Opc;
-    N0 = (N0.getOpcode() != ISD::Constant
-          ? DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0)
-          : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(),
-                            MVT::i16));
-    N1Opc = N1.getValueType().bitsLT(ShiftVT)
-            ? ISD::ZERO_EXTEND
-            : ISD::TRUNCATE;
-    N1 = (N1.getOpcode() != ISD::Constant
-          ? DAG.getNode(N1Opc, dl, ShiftVT, N1)
-          : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
-                            TLI.getShiftAmountTy()));
+    MVT N1VT = N1.getValueType();
+
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
+    if (!N1VT.bitsEq(ShiftVT)) {
+      unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
+                       ? ISD::ZERO_EXTEND
+                       : ISD::TRUNCATE;
+      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
+    }
+
+    // Replicate lower 8-bits into upper 8:
    SDValue ExpandArg =
      DAG.getNode(ISD::OR, dl, MVT::i16, N0,
                  DAG.getNode(ISD::SHL, dl, MVT::i16,
                              N0, DAG.getConstant(8, MVT::i32)));
+
+    // Truncate back down to i8
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
  }
  case ISD::SRL:
  case ISD::SHL: {
    SDValue N1 = Op.getOperand(1);
-    unsigned N1Opc;
-    N0 = (N0.getOpcode() != ISD::Constant
-          ? DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0)
-          : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(),
-                            MVT::i32));
-    N1Opc = N1.getValueType().bitsLT(ShiftVT)
-            ? ISD::ZERO_EXTEND
-            : ISD::TRUNCATE;
-    N1 = (N1.getOpcode() != ISD::Constant
-          ? DAG.getNode(N1Opc, dl, ShiftVT, N1)
-          : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(), ShiftVT));
+    MVT N1VT = N1.getValueType();
+
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
+    if (!N1VT.bitsEq(ShiftVT)) {
+      unsigned N1Opc = ISD::ZERO_EXTEND;
+
+      if (N1.getValueType().bitsGT(ShiftVT))
+        N1Opc = ISD::TRUNCATE;
+
+      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
+    }
+
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
  }
  case ISD::SRA: {
    SDValue N1 = Op.getOperand(1);
-    unsigned N1Opc;
-    N0 = (N0.getOpcode() != ISD::Constant
-          ? DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0)
-          : DAG.getConstant(cast<ConstantSDNode>(N0)->getSExtValue(),
-                            MVT::i16));
-    N1Opc = N1.getValueType().bitsLT(ShiftVT)
-            ? ISD::SIGN_EXTEND
-            : ISD::TRUNCATE;
-    N1 = (N1.getOpcode() != ISD::Constant
-          ? DAG.getNode(N1Opc, dl, ShiftVT, N1)
-          : DAG.getConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
-                            ShiftVT));
+    MVT N1VT = N1.getValueType();
+
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+    if (!N1VT.bitsEq(ShiftVT)) {
+      unsigned N1Opc = ISD::SIGN_EXTEND;
+
+      if (N1VT.bitsGT(ShiftVT))
+        N1Opc = ISD::TRUNCATE;
+      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
+    }
+
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
  }
  case ISD::MUL: {
    SDValue N1 = Op.getOperand(1);
-    unsigned N1Opc;
-    N0 = (N0.getOpcode() != ISD::Constant
-          ? DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0)
-          : DAG.getConstant(cast<ConstantSDNode>(N0)->getZExtValue(),
-                            MVT::i16));
-    N1Opc = N1.getValueType().bitsLT(MVT::i16) ? ISD::SIGN_EXTEND : ISD::TRUNCATE;
-    N1 = (N1.getOpcode() != ISD::Constant
-          ? DAG.getNode(N1Opc, dl, MVT::i16, N1)
-          : DAG.getConstant(cast<ConstantSDNode>(N1)->getSExtValue(),
-                            MVT::i16));
+
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
    break;
@ -2238,36 +2128,6 @@ static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
  return SDValue();
 }

-//! Generate the carry-generate shuffle mask.
-SDValue SPU::getCarryGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) {
-  SmallVector<SDValue, 16 > ShufBytes;
-
-  // Create the shuffle mask for "rotating" the borrow up one register slot
-  // once the borrow is generated.
-  ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
-  ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
-  ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
-  ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
-
-  return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
-                     &ShufBytes[0], ShufBytes.size());
-}
-
-//! Generate the borrow-generate shuffle mask
-SDValue SPU::getBorrowGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) {
-  SmallVector<SDValue, 16 > ShufBytes;
-
-  // Create the shuffle mask for "rotating" the borrow up one register slot
-  // once the borrow is generated.
-  ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
-  ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
-  ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
-  ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
-
-  return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
-                     &ShufBytes[0], ShufBytes.size());
-}
-
 //! Lower byte immediate operations for v16i8 vectors:
 static SDValue
 LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
@ -2291,26 +2151,24 @@ LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
  }

  if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
-    uint64_t VectorBits[2];
-    uint64_t UndefBits[2];
-    uint64_t SplatBits, SplatUndef;
-    int SplatSize;
+    BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
+    assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");

-    if (!GetConstantBuildVectorBits(ConstVec.getNode(), VectorBits, UndefBits)
-        && isConstantSplat(VectorBits, UndefBits,
-                           VT.getVectorElementType().getSizeInBits(),
-                           SplatBits, SplatUndef, SplatSize)) {
-      SDValue tcVec[16];
+    APInt APSplatBits, APSplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();
+
+    if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                              HasAnyUndefs, minSplatBits)
+        && minSplatBits <= SplatBitSize) {
+      uint64_t SplatBits = APSplatBits.getZExtValue();
      SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
-      const size_t tcVecSize = sizeof(tcVec) / sizeof(tcVec[0]);
-
-      // Turn the BUILD_VECTOR into a set of target constants:
-      for (size_t i = 0; i < tcVecSize; ++i)
-        tcVec[i] = tc;

+      SmallVector<SDValue, 16> tcVec;
+      tcVec.assign(16, tc);
      return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
-                         DAG.getNode(ISD::BUILD_VECTOR, dl, VT, 
-                                     tcVec, tcVecSize));
+                         DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
    }
  }

@ -2452,7 +2310,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
  }

-  return Op;                    // return unmolested, legalized op
+  return SDValue();
 }

 //! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
@ -2478,7 +2336,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
  }

-  return Op;                    // return unmolested, legalized
+  return SDValue();
 }

 //! Lower ISD::SETCC
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@ -78,11 +78,9 @@ namespace llvm {

    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG,
                              const SPUTargetMachine &TM);
-    SDValue LowerSplat_v2i64(MVT OpVT, SelectionDAG &DAG, uint64_t splat,
+    //! Simplify a MVT::v2i64 constant splat to CellSPU-ready form
+    SDValue LowerV2I64Splat(MVT OpVT, SelectionDAG &DAG, uint64_t splat,
                             DebugLoc dl);
-
-    SDValue getBorrowGenerateShufMask(SelectionDAG &DAG, DebugLoc dl);
-    SDValue getCarryGenerateShufMask(SelectionDAG &DAG, DebugLoc dl);
  }

  class SPUTargetMachine;            // forward dec'l.
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@ -60,9 +60,6 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
                          unsigned& SrcSR, unsigned& DstSR) const {
  SrcSR = DstSR = 0;  // No sub-registers.

-  // Primarily, ORI and OR are generated by copyRegToReg. But, there are other
-  // cases where we can safely say that what's being done is really a move
-  // (see how PowerPC does this -- it's the model for this code too.)
  switch (MI.getOpcode()) {
  default:
    break;
@ -167,7 +164,7 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
           MI.getOperand(1).isReg() &&
           "invalid SPU OR<type>_<vec> or LR instruction!");
    if (MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
-      sourceReg = MI.getOperand(0).getReg();
+      sourceReg = MI.getOperand(1).getReg();
      destReg = MI.getOperand(0).getReg();
      return true;
    }
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@ -1258,10 +1258,9 @@ multiclass BitwiseAnd
  def fabs32: ANDInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB),
                      [/* Intentionally does not match a pattern */]>;

-  def fabs64: ANDInst<(outs R64FP:$rT), (ins R64FP:$rA, VECREG:$rB),
+  def fabs64: ANDInst<(outs R64FP:$rT), (ins R64FP:$rA, R64C:$rB),
                      [/* Intentionally does not match a pattern */]>;

-  // Could use v4i32, but won't for clarity
  def fabsvec: ANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
                       [/* Intentionally does not match a pattern */]>;

@ -1288,10 +1287,11 @@ class ANDCInst<dag OOL, dag IOL, list<dag> pattern>:
    RRForm<0b10000011010, OOL, IOL, "andc\t$rT, $rA, $rB",
           IntegerOp, pattern>;

-class ANDCVecInst<ValueType vectype>:
+class ANDCVecInst<ValueType vectype, PatFrag vnot_frag = vnot>:
    ANDCInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-             [(set (vectype VECREG:$rT), (and (vectype VECREG:$rA),
-                                              (vnot (vectype VECREG:$rB))))]>;
+             [(set (vectype VECREG:$rT),
+                   (and (vectype VECREG:$rA),
+                        (vnot_frag (vectype VECREG:$rB))))]>;

 class ANDCRegInst<RegisterClass rclass>:
    ANDCInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
@ -1309,6 +1309,9 @@ multiclass AndComplement
  def r32:  ANDCRegInst<R32C>;
  def r16:  ANDCRegInst<R16C>;
  def r8:   ANDCRegInst<R8C>;
+
+  // Sometimes, the xor pattern has a bitcast constant:
+  def v16i8_conv: ANDCVecInst<v16i8, vnot_conv>;
 }

 defm ANDC : AndComplement;
@ -1480,6 +1483,17 @@ multiclass BitwiseOr
  def f64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
                  [/* no pattern */]>;

+  // OR forms for f32/f64 fneg. NOTE(review): OR sets the sign bit; confirm.
+
+  def fneg32: ORInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB),
+                     [/* no pattern */]>;
+
+  def fneg64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64C:$rB),
+                     [/* no pattern */]>;
+
+  def fnegvec: ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                      [/* no pattern, see fneg{32,64} */]>;
+
  // scalar->vector promotion, prefslot2vec:
  def v16i8_i8:  ORPromoteScalar<R8C>;
  def v8i16_i16: ORPromoteScalar<R16C>;
@ -1783,18 +1797,6 @@ multiclass BitwiseExclusiveOr
  def r32:   XORRegInst<R32C>;
  def r16:   XORRegInst<R16C>;
  def r8:    XORRegInst<R8C>;
-
-  // Special forms for floating point instructions.
-  // fneg and fabs require bitwise logical ops to manipulate the sign bit.
-
-  def fneg32: XORInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB),
-                      [/* no pattern */]>;
-
-  def fneg64: XORInst<(outs R64FP:$rT), (ins R64FP:$rA, VECREG:$rB),
-                      [/* no pattern */]>;
-
-  def fnegvec: XORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
-                       [/* no pattern, see fneg{32,64} */]>;
 }

 defm XOR : BitwiseExclusiveOr;
@ -4239,33 +4241,36 @@ def FMSv2f64 :
            (fsub (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)),
                  (v2f64 VECREG:$rC)))]>;

-// FNMS: - (a * b - c)
+// DFNMS: - (a * b - c)
 // - (a * b) + c => c - (a * b)
-def FNMSf64 :
-    RRForm<0b01111010110, (outs R64FP:$rT),
-                          (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
-      "dfnms\t$rT, $rA, $rB", DPrecFP,
-      [(set R64FP:$rT, (fsub R64FP:$rC, (fmul R64FP:$rA, R64FP:$rB)))]>,
+
+class DFNMSInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b01111010110, OOL, IOL, "dfnms\t$rT, $rA, $rB",
+           DPrecFP, pattern>,
    RegConstraint<"$rC = $rT">,
    NoEncode<"$rC">;

-def : Pat<(fneg (fsub (fmul R64FP:$rA, R64FP:$rB), R64FP:$rC)),
-          (FNMSf64 R64FP:$rA, R64FP:$rB, R64FP:$rC)>;
+class DFNMSVecInst<list<dag> pattern>:
+    DFNMSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+              pattern>;

-def FNMSv2f64 :
-    RRForm<0b01111010110, (outs VECREG:$rT),
-                          (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
-      "dfnms\t$rT, $rA, $rB", DPrecFP,
-      [(set (v2f64 VECREG:$rT),
-            (fsub (v2f64 VECREG:$rC),
-                  (fmul (v2f64 VECREG:$rA),
-                        (v2f64 VECREG:$rB))))]>,
-    RegConstraint<"$rC = $rT">,
-    NoEncode<"$rC">;
+class DFNMSRegInst<list<dag> pattern>:
+    DFNMSInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
+             pattern>;

-def : Pat<(fneg (fsub (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)),
-                (v2f64 VECREG:$rC))),
-          (FNMSv2f64 VECREG:$rA, VECREG:$rB, VECREG:$rC)>;
+multiclass DFMultiplySubtract
+{
+  def v2f64 : DFNMSVecInst<[(set (v2f64 VECREG:$rT), 
+                                 (fsub (v2f64 VECREG:$rC),
+                                       (fmul (v2f64 VECREG:$rA),
+                                             (v2f64 VECREG:$rB))))]>;
+
+  def f64 : DFNMSRegInst<[(set R64FP:$rT,
+                               (fsub R64FP:$rC,
+                                     (fmul R64FP:$rA, R64FP:$rB)))]>;
+}
+
+defm DFNMS : DFMultiplySubtract;

 // - (a * b + c)
 // - (a * b) - c
@ -4293,35 +4298,21 @@ def FNMAv2f64 :
 //===----------------------------------------------------------------------==//

 def : Pat<(fneg (v4f32 VECREG:$rA)),
-          (XORfnegvec (v4f32 VECREG:$rA),
-                      (v4f32 (ILHUv4i32 0x8000)))>;
+          (ORfnegvec (v4f32 VECREG:$rA),
+                     (v4f32 (ILHUv4i32 0x8000)))>;

 def : Pat<(fneg R32FP:$rA),
-          (XORfneg32 R32FP:$rA, (ILHUr32 0x8000))>;
-
-def : Pat<(fneg (v2f64 VECREG:$rA)),
-          (XORfnegvec (v2f64 VECREG:$rA),
-                      (v2f64 (ANDBIv16i8 (FSMBIv16i8 0x8080), 0x80)))>;
-
-def : Pat<(fneg R64FP:$rA),
-          (XORfneg64 R64FP:$rA,
-                     (ANDBIv16i8 (FSMBIv16i8 0x8080), 0x80))>;
+          (ORfneg32 R32FP:$rA, (ILHUr32 0x8000))>;

 // Floating point absolute value
+// Note: f64 fabs is custom-selected.

 def : Pat<(fabs R32FP:$rA),
          (ANDfabs32 R32FP:$rA, (IOHLr32 (ILHUr32 0x7fff), 0xffff))>;

 def : Pat<(fabs (v4f32 VECREG:$rA)),
          (ANDfabsvec (v4f32 VECREG:$rA),
-                      (v4f32 (ANDBIv16i8 (FSMBIv16i8 0xffff), 0x7f)))>;
-
-def : Pat<(fabs R64FP:$rA),
-          (ANDfabs64 R64FP:$rA, (ANDBIv16i8 (FSMBIv16i8 0xffff), 0x7f))>;
-
-def : Pat<(fabs (v2f64 VECREG:$rA)),
-          (ANDfabsvec (v2f64 VECREG:$rA),
-                      (v2f64 (ANDBIv16i8 (FSMBIv16i8 0xffff), 0x7f)))>;
+                      (IOHLv4i32 (ILHUv4i32 0x7fff), 0xffff))>;

 //===----------------------------------------------------------------------===//
 // Hint for branch instructions:
--- a/test/CodeGen/CellSPU/2009-01-01-BrCond.ll
+++ b/test/CodeGen/CellSPU/2009-01-01-BrCond.ll
@ -8,11 +8,11 @@ target triple = "spu"

 define double @__floatunsidf(i32 %arg_a) nounwind {
 entry:
-	%in = alloca %struct.fp_number_type, align 8		; <%struct.fp_number_type*> [#uses=5]
-	%0 = getelementptr %struct.fp_number_type* %in, i32 0, i32 1		; <i32*> [#uses=1]
+	%in = alloca %struct.fp_number_type, align 16
+	%0 = getelementptr %struct.fp_number_type* %in, i32 0, i32 1
 	store i32 0, i32* %0, align 4
-	%1 = icmp eq i32 %arg_a, 0		; <i1> [#uses=1]
-	%2 = getelementptr %struct.fp_number_type* %in, i32 0, i32 0		; <i32*> [#uses=2]
+	%1 = icmp eq i32 %arg_a, 0
+	%2 = getelementptr %struct.fp_number_type* %in, i32 0, i32 0
 	br i1 %1, label %bb, label %bb1

 bb:		; preds = %entry
@ -26,6 +26,6 @@ bb7:		; preds = %bb5, %bb1, %bb
 	ret double 1.0
 }

-declare i32 @llvm.ctlz.i32(i32) nounwind readnone
+; declare i32 @llvm.ctlz.i32(i32) nounwind readnone

 declare double @__pack_d(%struct.fp_number_type*)
--- a/test/CodeGen/CellSPU/fneg-fabs.ll
+++ b/test/CodeGen/CellSPU/fneg-fabs.ll
@ -1,9 +1,7 @@
 ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
-; RUN: grep fsmbi   %t1.s | count 3
 ; RUN: grep 32768   %t1.s | count 2
-; RUN: grep xor     %t1.s | count 4
-; RUN: grep and     %t1.s | count 5
-; RUN: grep andbi   %t1.s | count 3
+; RUN: grep or      %t1.s | count 4
+; RUN: grep and     %t1.s | count 2

 target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
 target triple = "spu"