mirror of https://github.com/RPCS3/llvm-mirror.git
CellSPU:
- Rename fcmp.ll test to fcmp32.ll, start adding new double tests to fcmp64.ll
- Fix select_bits.ll test
- Capitulate to the DAGCombiner and move i64 constant loads to instruction
  selection (SPUISelDAGtoDAG.cpp).

  <rant>DAGCombiner will insert all kinds of 64-bit optimizations after
  operation legalization occurs and now we have to do most of the work that
  instruction selection should be doing twice (once to determine if a v2i64
  build_vector can be handled by SelectCode(), which then runs all of the
  predicates a second time to select the necessary instructions). But,
  CellSPU is a good citizen.</rant>

llvm-svn: 62990
parent 48639fe6dc
commit da9360e77e
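For context, the i64-constant handling this commit moves into instruction
selection reduces to the splat-and-extract shape visible in the LowerConstant
and SelectI64Constant hunks below: the scalar is splatted into a v2i64
BUILD_VECTOR, and the result is read back out of the vector's preferred slot.
A minimal sketch of that shape, reusing names from the diff (an illustration,
not the committed code; SketchLowerI64Constant is a hypothetical helper):

    // Sketch: materialize an i64 constant by splatting it into a v2i64
    // BUILD_VECTOR, then extracting the scalar from the preferred slot.
    // SketchLowerI64Constant is a hypothetical helper, not an LLVM API.
    SDValue SketchLowerI64Constant(ConstantSDNode *CN, SelectionDAG &DAG) {
      SDValue T = DAG.getConstant(CN->getZExtValue(), MVT::i64);
      SDValue Splat = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T);
      return DAG.getNode(SPUISD::VEC2PREFSLOT, MVT::i64, Splat);
    }

The point of going through a v2i64 splat is that the existing build_vector
predicates (get_vec_i16imm, get_ILHUvec_imm, get_vec_u18imm) can then decide
whether the constant fits an immediate form or must be spilled to the
constant pool, as emitBuildVector below does.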
@@ -30,8 +30,8 @@
// selb instruction definition for i64. Note that the selection mask is
// a vector, produced by various forms of FSM:
def SELBr64_cond:
SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC),
[/* no pattern */]>;

// The generic i64 select pattern, which assumes that the comparison result
// is in a 32-bit register that contains a select mask pattern (i.e., gather
@@ -254,26 +254,56 @@ public:
/// getSmallIPtrImm - Return a target constant of pointer type.
inline SDValue getSmallIPtrImm(unsigned Imm) {
return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy());
}

SDNode *emitBuildVector(SDValue build_vec) {
MVT vecVT = build_vec.getValueType();
SDNode *bvNode = build_vec.getNode();
bool canBeSelected = false;

// Check to see if this vector can be represented as a CellSPU immediate
// constant.
if (vecVT == MVT::v8i16) {
if (SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i16).getNode() != 0) {
canBeSelected = true;
}
} else if (vecVT == MVT::v4i32) {
if ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i32).getNode() != 0)
|| (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i32).getNode() != 0)
|| (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i32).getNode() != 0)
|| (SPU::get_v4i32_imm(bvNode, *CurDAG).getNode() != 0)) {
canBeSelected = true;
}
} else if (vecVT == MVT::v2i64) {
if ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i64).getNode() != 0)
|| (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i64).getNode() != 0)
|| (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i64).getNode() != 0)) {
canBeSelected = true;
}
}

if (canBeSelected) {
return Select(build_vec);
}

// No, need to emit a constant pool spill:
std::vector<Constant*> CV;

for (size_t i = 0; i < build_vec.getNumOperands(); ++i) {
ConstantSDNode *V = dyn_cast<ConstantSDNode>(build_vec.getOperand(i));
CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
}

Constant *CP = ConstantVector::get(CV);
SDValue CPIdx = CurDAG->getConstantPool(CP, SPUtli.getPointerTy());
unsigned Alignment = 1 << cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue CGPoolOffset =
SPU::LowerConstantPool(CPIdx, *CurDAG,
SPUtli.getSPUTargetMachine());
return SelectCode(CurDAG->getLoad(build_vec.getValueType(),
CurDAG->getEntryNode(), CGPoolOffset,
PseudoSourceValue::getConstantPool(), 0,
false, Alignment));
}

/// Select - Convert the specified operand from a target-independent to a
@@ -289,6 +319,9 @@ public:
//! Emit the instruction sequence for i64 sra
SDNode *SelectSRAi64(SDValue &Op, MVT OpVT);

//! Emit the necessary sequence for loading i64 constants:
SDNode *SelectI64Constant(SDValue &Op, MVT OpVT);

//! Returns true if the address N is an A-form (local store) address
bool SelectAFormAddr(SDValue Op, SDValue N, SDValue &Base,
SDValue &Index);
@@ -652,7 +685,9 @@ SPUDAGToDAGISel::Select(SDValue Op) {

if (N->isMachineOpcode()) {
return NULL; // Already selected.
} else if (Opc == ISD::FrameIndex) {
}

if (Opc == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, Op.getValueType());
SDValue Imm0 = CurDAG->getTargetConstant(0, Op.getValueType());
@@ -669,6 +704,11 @@ SPUDAGToDAGISel::Select(SDValue Op) {
TFI, Imm0), 0);
n_ops = 2;
}
} else if (Opc == ISD::Constant && OpVT == MVT::i64) {
// Catch the i64 constants that end up here. Note: The backend doesn't
// attempt to legalize the constant (it's useless because DAGCombiner
// will insert 64-bit constants and we can't stop it).
return SelectI64Constant(Op, OpVT);
} else if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND)
&& OpVT == MVT::i64) {
SDValue Op0 = Op.getOperand(0);
@@ -745,27 +785,38 @@ SPUDAGToDAGISel::Select(SDValue Op) {
return SelectCode(CurDAG->getNode(SPUISD::MUL64_MARKER, OpVT,
Op.getOperand(0), Op.getOperand(1),
SDValue(CGLoad, 0)));
} else if (Opc == ISD::ADD && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
SDNode *CGLoad =
emitBuildVector(SPU::getCarryGenerateShufMask(*CurDAG));
} else if (Opc == ISD::TRUNCATE) {
SDValue Op0 = Op.getOperand(0);
if ((Op0.getOpcode() == ISD::SRA || Op0.getOpcode() == ISD::SRL)
&& OpVT == MVT::i32
&& Op0.getValueType() == MVT::i64) {
// Catch the (truncate:i32 ([sra|srl]:i64 arg, c), where c >= 32 to
// take advantage of the fact that the upper 32 bits are in the
// i32 preferred slot and avoid all kinds of other shuffle gymnastics:
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
if (CN != 0) {
unsigned shift_amt = unsigned(CN->getZExtValue());

return SelectCode(CurDAG->getNode(SPUISD::ADD64_MARKER, OpVT,
Op.getOperand(0), Op.getOperand(1),
SDValue(CGLoad, 0)));
} else if (Opc == ISD::SUB && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
SDNode *CGLoad =
emitBuildVector(SPU::getBorrowGenerateShufMask(*CurDAG));
if (shift_amt >= 32) {
SDNode *hi32 =
CurDAG->getTargetNode(SPU::ORr32_r64, OpVT, Op0.getOperand(0));

return SelectCode(CurDAG->getNode(SPUISD::SUB64_MARKER, OpVT,
Op.getOperand(0), Op.getOperand(1),
SDValue(CGLoad, 0)));
} else if (Opc == ISD::MUL && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
SDNode *CGLoad =
emitBuildVector(SPU::getCarryGenerateShufMask(*CurDAG));
shift_amt -= 32;
if (shift_amt > 0) {
// Take care of the additional shift, if present:
SDValue shift = CurDAG->getTargetConstant(shift_amt, MVT::i32);
unsigned Opc = SPU::ROTMAIr32_i32;

if (Op0.getOpcode() == ISD::SRL)
Opc = SPU::ROTMr32;

return SelectCode(CurDAG->getNode(SPUISD::MUL64_MARKER, OpVT,
Op.getOperand(0), Op.getOperand(1),
SDValue(CGLoad, 0)));
hi32 = CurDAG->getTargetNode(Opc, OpVT, SDValue(hi32, 0), shift);
}

return hi32;
}
}
}
} else if (Opc == ISD::SHL) {
if (OpVT == MVT::i64) {
return SelectSHLi64(Op, OpVT);
@@ -1046,6 +1097,70 @@ SPUDAGToDAGISel::SelectSRAi64(SDValue &Op, MVT OpVT) {
return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(Shift, 0));
}

/*!
Do the necessary magic necessary to load a i64 constant
*/
SDNode *SPUDAGToDAGISel::SelectI64Constant(SDValue& Op, MVT OpVT) {
ConstantSDNode *CN = cast<ConstantSDNode>(Op.getNode());
MVT OpVecVT = MVT::getVectorVT(OpVT, 2);
SDValue i64vec =
SPU::LowerSplat_v2i64(OpVecVT, *CurDAG, CN->getZExtValue());

// Here's where it gets interesting, because we have to parse out the
// subtree handed back in i64vec:

if (i64vec.getOpcode() == ISD::BIT_CONVERT) {
// The degenerate case where the upper and lower bits in the splat are
// identical:
SDValue Op0 = i64vec.getOperand(0);
ReplaceUses(i64vec, Op0);

return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT,
SDValue(emitBuildVector(Op0), 0));
} else if (i64vec.getOpcode() == SPUISD::SHUFB) {
SDValue lhs = i64vec.getOperand(0);
SDValue rhs = i64vec.getOperand(1);
SDValue shufmask = i64vec.getOperand(2);

if (lhs.getOpcode() == ISD::BIT_CONVERT) {
ReplaceUses(lhs, lhs.getOperand(0));
lhs = lhs.getOperand(0);
}

SDNode *lhsNode = (lhs.getNode()->isMachineOpcode()
? lhs.getNode()
: emitBuildVector(lhs));

if (rhs.getOpcode() == ISD::BIT_CONVERT) {
ReplaceUses(rhs, rhs.getOperand(0));
rhs = rhs.getOperand(0);
}

SDNode *rhsNode = (rhs.getNode()->isMachineOpcode()
? rhs.getNode()
: emitBuildVector(rhs));

if (shufmask.getOpcode() == ISD::BIT_CONVERT) {
ReplaceUses(shufmask, shufmask.getOperand(0));
shufmask = shufmask.getOperand(0);
}

SDNode *shufMaskNode = (shufmask.getNode()->isMachineOpcode()
? shufmask.getNode()
: emitBuildVector(shufmask));

SDNode *shufNode =
Select(CurDAG->getNode(SPUISD::SHUFB, OpVecVT,
SDValue(lhsNode, 0), SDValue(rhsNode, 0),
SDValue(shufMaskNode, 0)));

return CurDAG->getTargetNode(SPU::ORi64_v2i64, OpVT, SDValue(shufNode, 0));
} else {
cerr << "SPUDAGToDAGISel::SelectI64Constant: Unhandled i64vec condition\n";
abort();
}
}

/// createSPUISelDag - This pass converts a legalized DAG into a
/// SPU-specific DAG, ready for instruction scheduling.
///
@@ -17,6 +17,7 @@
#include "SPUFrameInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CallingConv.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -79,6 +80,43 @@ namespace {
return retval;
}

//! Expand a library call into an actual call DAG node
/*!
\note
This code is taken from SelectionDAGLegalize, since it is not exposed as
part of the LLVM SelectionDAG API.
*/

SDValue
ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
bool isSigned, SDValue &Hi, SPUTargetLowering &TLI) {
// The input chain to this libcall is the entry node of the function.
// Legalizing the call will automatically add the previous call to the
// dependence.
SDValue InChain = DAG.getEntryNode();

TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
MVT ArgVT = Op.getOperand(i).getValueType();
const Type *ArgTy = ArgVT.getTypeForMVT();
Entry.Node = Op.getOperand(i);
Entry.Ty = ArgTy;
Entry.isSExt = isSigned;
Entry.isZExt = !isSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
TLI.getPointerTy());

// Splice the libcall in wherever FindInputOutputChains tells us to.
const Type *RetTy = Op.getNode()->getValueType(0).getTypeForMVT();
std::pair<SDValue, SDValue> CallInfo =
TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
CallingConv::C, false, Callee, Args, DAG);

return CallInfo.first;
}
}

SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
@@ -113,7 +151,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);

// SPU constant load actions are custom lowered:
setOperationAction(ISD::Constant, MVT::i64, Custom);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

@@ -128,10 +165,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, Custom);

// SMUL_LOHI, UMUL_LOHI are not legal for Cell:
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);

for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
MVT StoreVT = (MVT::SimpleValueType) stype;
setTruncStoreAction(VT, StoreVT, Expand);
@@ -179,16 +212,14 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setOperationAction(ISD::FCOS , MVT::f32, Expand);
setOperationAction(ISD::FREM , MVT::f32, Expand);

// If we're enabling GP optimizations, use hardware square root
// Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
// for f32!)
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
setOperationAction(ISD::FSQRT, MVT::f32, Expand);

setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

// Make sure that DAGCombine doesn't insert illegal 64-bit constants
setOperationAction(ISD::FABS, MVT::f64, Custom);

// SPU can do rotate right and left, so legalize it... but customize for i8
// because instructions don't exist.

@@ -254,22 +285,21 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
// Custom lower i128 -> i64 truncates
setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);

// SPU has a legal FP -> signed INT instruction
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
// SPU has a legal FP -> signed INT instruction for f32, but for f64, need
// to expand to a libcall, hence the custom lowering:
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

// FDIV on SPU requires custom lowering
setOperationAction(ISD::FDIV, MVT::f64, Expand); // libcall
setOperationAction(ISD::FDIV, MVT::f64, Expand); // to libcall

// SPU has [U|S]INT_TO_FP
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

@@ -338,24 +368,23 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
MVT VT = (MVT::SimpleValueType)i;

// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD , VT, Legal);
setOperationAction(ISD::SUB , VT, Legal);
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
// mul has to be custom lowered.
// TODO: v2i64 vector multiply
setOperationAction(ISD::MUL , VT, Legal);
setOperationAction(ISD::MUL, VT, Legal);

setOperationAction(ISD::AND , VT, Legal);
setOperationAction(ISD::OR , VT, Legal);
setOperationAction(ISD::XOR , VT, Legal);
setOperationAction(ISD::LOAD , VT, Legal);
setOperationAction(ISD::SELECT, VT, Legal);
setOperationAction(ISD::STORE, VT, Legal);
setOperationAction(ISD::AND, VT, Legal);
setOperationAction(ISD::OR, VT, Legal);
setOperationAction(ISD::XOR, VT, Legal);
setOperationAction(ISD::LOAD, VT, Legal);
setOperationAction(ISD::SELECT, VT, Legal);
setOperationAction(ISD::STORE, VT, Legal);

// These operations need to be expanded:
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);

// Custom lower build_vector, constant pool spills, insert and
// extract vector elements:
@@ -866,31 +895,6 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
return SDValue();
}

//! Custom lower i64 integer constants
/*!
This code inserts all of the necessary juggling that needs to occur to load
a 64-bit constant into a register.
*/
static SDValue
LowerConstant(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getValueType();

if (VT == MVT::i64) {
ConstantSDNode *CN = cast<ConstantSDNode>(Op.getNode());
SDValue T = DAG.getConstant(CN->getZExtValue(), VT);
return DAG.getNode(SPUISD::VEC2PREFSLOT, VT,
DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T));
} else {
cerr << "LowerConstant: unhandled constant type "
<< VT.getMVTString()
<< "\n";
abort();
/*NOTREACHED*/
}

return SDValue();
}

//! Custom lower double precision floating point constants
static SDValue
LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
@@ -1564,7 +1568,7 @@ static bool isConstantSplat(const uint64_t Bits128[2],

//! Lower a BUILD_VECTOR instruction creatively:
SDValue
SPU::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getValueType();
// If this is a vector of constants or undefs, get the bits. A bit in
// UndefBits is set if the corresponding element of the vector is an
@@ -1588,7 +1592,7 @@ SPU::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
abort();
/*NOTREACHED*/
case MVT::v4f32: {
uint32_t Value32 = SplatBits;
uint32_t Value32 = uint32_t(SplatBits);
assert(SplatSize == 4
&& "LowerBUILD_VECTOR: Unexpected floating point vector element.");
// NOTE: pretend the constant is an integer. LLVM won't load FP constants
@@ -1598,7 +1602,7 @@ SPU::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
break;
}
case MVT::v2f64: {
uint64_t f64val = SplatBits;
uint64_t f64val = uint64_t(SplatBits);
assert(SplatSize == 8
&& "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
// NOTE: pretend the constant is an integer. LLVM won't load FP constants
@@ -1638,95 +1642,101 @@ SPU::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::BUILD_VECTOR, VT, T, T);
}
case MVT::v2i64: {
uint64_t val = SplatBits;
uint32_t upper = uint32_t(val >> 32);
uint32_t lower = uint32_t(val);

if (upper == lower) {
// Magic constant that can be matched by IL, ILA, et. al.
SDValue Val = DAG.getTargetConstant(val, MVT::i64);
return DAG.getNode(ISD::BUILD_VECTOR, VT, Val, Val);
} else {
SDValue LO32;
SDValue HI32;
SmallVector<SDValue, 16> ShufBytes;
SDValue Result;
bool upper_special, lower_special;

// NOTE: This code creates common-case shuffle masks that can be easily
// detected as common expressions. It is not attempting to create highly
// specialized masks to replace any and all 0's, 0xff's and 0x80's.

// Detect if the upper or lower half is a special shuffle mask pattern:
upper_special = (upper == 0||upper == 0xffffffff||upper == 0x80000000);
lower_special = (lower == 0||lower == 0xffffffff||lower == 0x80000000);

// Create lower vector if not a special pattern
if (!lower_special) {
SDValue LO32C = DAG.getConstant(lower, MVT::i32);
LO32 = DAG.getNode(ISD::BIT_CONVERT, VT,
DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
LO32C, LO32C, LO32C, LO32C));
}

// Create upper vector if not a special pattern
if (!upper_special) {
SDValue HI32C = DAG.getConstant(upper, MVT::i32);
HI32 = DAG.getNode(ISD::BIT_CONVERT, VT,
DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
HI32C, HI32C, HI32C, HI32C));
}

// If either upper or lower are special, then the two input operands are
// the same (basically, one of them is a "don't care")
if (lower_special)
LO32 = HI32;
if (upper_special)
HI32 = LO32;
if (lower_special && upper_special) {
// Unhappy situation... both upper and lower are special, so punt with
// a target constant:
SDValue Zero = DAG.getConstant(0, MVT::i32);
HI32 = LO32 = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Zero, Zero,
Zero, Zero);
}

for (int i = 0; i < 4; ++i) {
uint64_t val = 0;
for (int j = 0; j < 4; ++j) {
SDValue V;
bool process_upper, process_lower;
val <<= 8;
process_upper = (upper_special && (i & 1) == 0);
process_lower = (lower_special && (i & 1) == 1);

if (process_upper || process_lower) {
if ((process_upper && upper == 0)
|| (process_lower && lower == 0))
val |= 0x80;
else if ((process_upper && upper == 0xffffffff)
|| (process_lower && lower == 0xffffffff))
val |= 0xc0;
else if ((process_upper && upper == 0x80000000)
|| (process_lower && lower == 0x80000000))
val |= (j == 0 ? 0xe0 : 0x80);
} else
val |= i * 4 + j + ((i & 1) * 16);
}

ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
}

return DAG.getNode(SPUISD::SHUFB, VT, HI32, LO32,
DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
&ShufBytes[0], ShufBytes.size()));
}
return SPU::LowerSplat_v2i64(VT, DAG, SplatBits);
}
}

return SDValue();
}

SDValue
SPU::LowerSplat_v2i64(MVT OpVT, SelectionDAG& DAG, uint64_t SplatVal) {
uint32_t upper = uint32_t(SplatVal >> 32);
uint32_t lower = uint32_t(SplatVal);

if (upper == lower) {
// Magic constant that can be matched by IL, ILA, et. al.
SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
return DAG.getNode(ISD::BIT_CONVERT, OpVT,
DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
Val, Val, Val, Val));
} else {
SDValue LO32;
SDValue HI32;
SmallVector<SDValue, 16> ShufBytes;
SDValue Result;
bool upper_special, lower_special;

// NOTE: This code creates common-case shuffle masks that can be easily
// detected as common expressions. It is not attempting to create highly
// specialized masks to replace any and all 0's, 0xff's and 0x80's.

// Detect if the upper or lower half is a special shuffle mask pattern:
upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);

// Create lower vector if not a special pattern
if (!lower_special) {
SDValue LO32C = DAG.getConstant(lower, MVT::i32);
LO32 = DAG.getNode(ISD::BIT_CONVERT, OpVT,
DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
LO32C, LO32C, LO32C, LO32C));
}

// Create upper vector if not a special pattern
if (!upper_special) {
SDValue HI32C = DAG.getConstant(upper, MVT::i32);
HI32 = DAG.getNode(ISD::BIT_CONVERT, OpVT,
DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
HI32C, HI32C, HI32C, HI32C));
}

// If either upper or lower are special, then the two input operands are
// the same (basically, one of them is a "don't care")
if (lower_special)
LO32 = HI32;
if (upper_special)
HI32 = LO32;
if (lower_special && upper_special) {
// Unhappy situation... both upper and lower are special, so punt with
// a target constant:
SDValue Zero = DAG.getConstant(0, MVT::i32);
HI32 = LO32 = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Zero, Zero,
Zero, Zero);
}

for (int i = 0; i < 4; ++i) {
uint64_t val = 0;
for (int j = 0; j < 4; ++j) {
SDValue V;
bool process_upper, process_lower;
val <<= 8;
process_upper = (upper_special && (i & 1) == 0);
process_lower = (lower_special && (i & 1) == 1);

if (process_upper || process_lower) {
if ((process_upper && upper == 0)
|| (process_lower && lower == 0))
val |= 0x80;
else if ((process_upper && upper == 0xffffffff)
|| (process_lower && lower == 0xffffffff))
val |= 0xc0;
else if ((process_upper && upper == 0x80000000)
|| (process_lower && lower == 0x80000000))
val |= (j == 0 ? 0xe0 : 0x80);
} else
val |= i * 4 + j + ((i & 1) * 16);
}

ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
}

return DAG.getNode(SPUISD::SHUFB, OpVT, HI32, LO32,
DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
&ShufBytes[0], ShufBytes.size()));
}
}
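(A worked example of the mask construction above; the arithmetic is mine, not
part of the commit. Take SplatVal = 0x0000000012345678: upper == 0 is a
"special" pattern, lower == 0x12345678 is not. LO32 becomes a v4i32 splat of
0x12345678 and HI32 aliases LO32. The loop then emits the selector words
0x80808080, 0x14151617, 0x80808080, 0x1c1d1e1f: selector byte 0x80 makes
shufb produce 0x00, while bytes 0x14-0x17 and 0x1c-0x1f pick bytes 4-7 and
12-15 of the second operand, i.e. a copy of lower. Each i64 lane of the
SHUFB result is therefore 0x0000000012345678, as intended.)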

/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
/// which the Cell can operate. The code inspects V3 to ascertain whether the
/// permutation vector, V3, is monotonically increasing with one "exception"
@@ -2384,81 +2394,180 @@ static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
return SDValue();
}

//! Lower ISD::FABS
//! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
/*!
DAGCombine does the same basic reduction: convert the double to i64 and mask
off the sign bit. Unfortunately, DAGCombine inserts the i64 constant, which
CellSPU has to legalize. Hence, the custom lowering.
f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
All conversions to i64 are expanded to a libcall.
*/

static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
SPUTargetLowering &TLI) {
MVT OpVT = Op.getValueType();
MVT IntVT(MVT::i64);
SDValue Op0 = Op.getOperand(0);
MVT Op0VT = Op0.getValueType();

assert(OpVT == MVT::f64 && "LowerFABS: expecting MVT::f64!\n");
if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
|| OpVT == MVT::i64) {
// Convert f32 / f64 to i32 / i64 via libcall.
RTLIB::Libcall LC =
(Op.getOpcode() == ISD::FP_TO_SINT)
? RTLIB::getFPTOSINT(Op0VT, OpVT)
: RTLIB::getFPTOUINT(Op0VT, OpVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
SDValue Dummy;
return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
}

SDValue iABS =
DAG.getNode(ISD::AND, IntVT,
DAG.getNode(ISD::BIT_CONVERT, IntVT, Op0),
DAG.getConstant(~IntVT.getIntegerVTSignBit(), IntVT));
return SDValue();
}

return DAG.getNode(ISD::BIT_CONVERT, MVT::f64, iABS);
//! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
/*!
i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
All conversions from i64 are expanded to a libcall.
*/
static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
SPUTargetLowering &TLI) {
MVT OpVT = Op.getValueType();
SDValue Op0 = Op.getOperand(0);
MVT Op0VT = Op0.getValueType();

if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
|| Op0VT == MVT::i64) {
// Convert i32, i64 to f64 via libcall:
RTLIB::Libcall LC =
(Op.getOpcode() == ISD::SINT_TO_FP)
? RTLIB::getSINTTOFP(Op0VT, OpVT)
: RTLIB::getUINTTOFP(Op0VT, OpVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
SDValue Dummy;
return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
}

return SDValue();
}

//! Lower ISD::SETCC
/*!
This handles MVT::f64 (double floating point) condition lowering
*/

static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI) {
CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");

SDValue lhs = Op.getOperand(0);
SDValue rhs = Op.getOperand(1);
MVT lhsVT = lhs.getValueType();
SDValue posNaN = DAG.getConstant(0x7ff0000000000001ULL, MVT::i64);

assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");

MVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
MVT IntVT(MVT::i64);

// Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
// selected to a NOP:
SDValue i64lhs = DAG.getNode(ISD::BIT_CONVERT, IntVT, lhs);
SDValue lhsHi32 =
DAG.getNode(ISD::TRUNCATE, MVT::i32,
DAG.getNode(ISD::SRL, IntVT,
i64lhs, DAG.getConstant(32, MVT::i32)));
SDValue lhsHi32abs =
DAG.getNode(ISD::AND, MVT::i32,
lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
SDValue lhsLo32 =
DAG.getNode(ISD::TRUNCATE, MVT::i32, i64lhs);

// SETO and SETUO only use the lhs operand:
if (CC->get() == ISD::SETO) {
// Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
// SETUO
APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
return DAG.getNode(ISD::XOR, ccResultVT,
DAG.getSetCC(ccResultVT,
lhs, DAG.getConstantFP(0.0, lhsVT),
ISD::SETUO),
DAG.getConstant(ccResultAllOnes, ccResultVT));
} else if (CC->get() == ISD::SETUO) {
// Evaluates to true if Op0 is [SQ]NaN
return DAG.getNode(ISD::AND, ccResultVT,
DAG.getSetCC(ccResultVT,
lhsHi32abs,
DAG.getConstant(0x7ff00000, MVT::i32),
ISD::SETGE),
DAG.getSetCC(ccResultVT,
lhsLo32,
DAG.getConstant(0, MVT::i32),
ISD::SETGT));
}

SDValue i64rhs = DAG.getNode(ISD::BIT_CONVERT, IntVT, rhs);
SDValue rhsHi32 =
DAG.getNode(ISD::TRUNCATE, MVT::i32,
DAG.getNode(ISD::SRL, IntVT,
i64rhs, DAG.getConstant(32, MVT::i32)));

// If a value is negative, subtract from the sign magnitude constant:
SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);

// Convert the sign-magnitude representation into 2's complement:
SDValue lhsSelectMask = DAG.getNode(ISD::SRA, ccResultVT,
lhsHi32, DAG.getConstant(31, MVT::i32));
SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, IntVT, signMag2TC, i64lhs);
SDValue lhsSelect =
DAG.getNode(ISD::SELECT, IntVT,
lhsSelectMask, lhsSignMag2TC, i64lhs);

SDValue rhsSelectMask = DAG.getNode(ISD::SRA, ccResultVT,
rhsHi32, DAG.getConstant(31, MVT::i32));
SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, IntVT, signMag2TC, i64rhs);
SDValue rhsSelect =
DAG.getNode(ISD::SELECT, IntVT,
rhsSelectMask, rhsSignMag2TC, i64rhs);

unsigned compareOp;

switch (CC->get()) {
case ISD::SETOEQ:
case ISD::SETOGT:
case ISD::SETOGE:
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETONE:
cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
abort();
break;
case ISD::SETO: {
SDValue lhsfabs = DAG.getNode(ISD::FABS, MVT::f64, lhs);
SDValue i64lhs =
DAG.getNode(ISD::BIT_CONVERT, MVT::i64, lhsfabs);

return DAG.getSetCC(MVT::i32, i64lhs, posNaN, ISD::SETLT);
}
case ISD::SETUO: {
SDValue lhsfabs = DAG.getNode(ISD::FABS, MVT::f64, lhs);
SDValue i64lhs =
DAG.getNode(ISD::BIT_CONVERT, MVT::i64, lhsfabs);

return DAG.getSetCC(MVT::i32, i64lhs, posNaN, ISD::SETGE);
}
case ISD::SETUEQ:
compareOp = ISD::SETEQ; break;
case ISD::SETOGT:
case ISD::SETUGT:
compareOp = ISD::SETGT; break;
case ISD::SETOGE:
case ISD::SETUGE:
compareOp = ISD::SETGE; break;
case ISD::SETOLT:
case ISD::SETULT:
compareOp = ISD::SETLT; break;
case ISD::SETOLE:
case ISD::SETULE:
compareOp = ISD::SETLE; break;
case ISD::SETUNE:
case ISD::SETONE:
compareOp = ISD::SETNE; break;
default:
cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
abort();
break;
}

return SDValue();
SDValue result =
DAG.getSetCC(ccResultVT, lhsSelect, rhsSelect, (ISD::CondCode) compareOp);

if ((CC->get() & 0x8) == 0) {
// Ordered comparison:
SDValue lhsNaN = DAG.getSetCC(ccResultVT,
lhs, DAG.getConstantFP(0.0, MVT::f64),
ISD::SETO);
SDValue rhsNaN = DAG.getSetCC(ccResultVT,
rhs, DAG.getConstantFP(0.0, MVT::f64),
ISD::SETO);
SDValue ordered = DAG.getNode(ISD::AND, ccResultVT, lhsNaN, rhsNaN);

result = DAG.getNode(ISD::AND, ccResultVT, ordered, result);
}

return result;
}
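(A quick numeric check of the sign-magnitude trick above; the arithmetic is
mine, not part of the commit. IEEE-754 doubles order like sign-magnitude
integers, so negative values compare backwards when their bits are treated as
signed i64. For inputs with the sign bit set, the code computes
0x8000000000000000 - x instead: -1.0 (bits 0xBFF0000000000000) maps to
0xC010000000000000 and -2.0 (bits 0xC000000000000000) maps to
0xC000000000000000, so the image of -2.0 now sits below that of -1.0 in a
signed comparison, while non-negative doubles pass through unchanged. One
signed integer setcc then implements the f64 comparison.)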

//! Lower ISD::SELECT_CC
@@ -2566,8 +2675,6 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
case ISD::JumpTable:
return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
case ISD::Constant:
return LowerConstant(Op, DAG);
case ISD::ConstantFP:
return LowerConstantFP(Op, DAG);
case ISD::FORMAL_ARGUMENTS:
@@ -2590,12 +2697,17 @@ SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
break;
}

case ISD::FABS:
return LowerFABS(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG, *this);

case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return LowerINT_TO_FP(Op, DAG, *this);

// Vector-related lowering.
case ISD::BUILD_VECTOR:
return SPU::LowerBUILD_VECTOR(Op, DAG);
return LowerBUILD_VECTOR(Op, DAG);
case ISD::SCALAR_TO_VECTOR:
return LowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
@@ -61,7 +61,7 @@ namespace llvm {
};
}

//! Utility functions specific to CellSPU-only:
//! Utility functions specific to CellSPU:
namespace SPU {
SDValue get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
MVT ValueType);
@@ -78,7 +78,7 @@ namespace llvm {

SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG,
const SPUTargetMachine &TM);
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
SDValue LowerSplat_v2i64(MVT OpVT, SelectionDAG &DAG, uint64_t splat);

SDValue getBorrowGenerateShufMask(SelectionDAG &DAG);
SDValue getCarryGenerateShufMask(SelectionDAG &DAG);
@@ -155,13 +155,13 @@ SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
case SPU::ORr8_r32:
case SPU::ORr32_r16:
case SPU::ORr32_r8:
case SPU::ORr32_r64:
case SPU::ORr16_r64:
case SPU::ORr8_r64:
case SPU::ORr64_r32:
case SPU::ORr64_r16:
case SPU::ORr64_r8:
*/
case SPU::ORr64_r32:
case SPU::ORr32_r64:
case SPU::ORf32_r32:
case SPU::ORr32_f32:
case SPU::ORf64_r64:
@@ -1259,6 +1259,9 @@ multiclass BitwiseAnd
def fabs32: ANDInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB),
[/* Intentionally does not match a pattern */]>;

def fabs64: ANDInst<(outs R64FP:$rT), (ins R64FP:$rA, VECREG:$rB),
[/* Intentionally does not match a pattern */]>;

// Could use v4i32, but won't for clarity
def fabsvec: ANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
[/* Intentionally does not match a pattern */]>;
@@ -1525,17 +1528,17 @@ multiclass BitwiseOr
// Conversion from R32C to register
def r32_r16: ORCvtFormR32Reg<R16C>;
def r32_r8: ORCvtFormR32Reg<R8C>;

// Conversion from register to R64C:
def r32_r64: ORCvtFormR64Reg<R32C>;
def r16_r64: ORCvtFormR64Reg<R16C>;
def r8_r64: ORCvtFormR64Reg<R8C>;

// Conversion from R64C to register
def r64_r32: ORCvtFormRegR64<R32C>;
def r64_r16: ORCvtFormRegR64<R16C>;
def r64_r8: ORCvtFormRegR64<R8C>;
*/

// Conversion to register from R64C:
def r32_r64: ORCvtFormR64Reg<R32C>;
// def r16_r64: ORCvtFormR64Reg<R16C>;
// def r8_r64: ORCvtFormR64Reg<R8C>;

// Conversion to R64C from register
def r64_r32: ORCvtFormRegR64<R32C>;
// def r64_r16: ORCvtFormRegR64<R16C>;
// def r64_r8: ORCvtFormRegR64<R8C>;

// bitconvert patterns:
def r32_f32: ORCvtFormR32Reg<R32FP,
@@ -1910,11 +1913,11 @@ class SELBInst<dag OOL, dag IOL, list<dag> pattern>:
RRRForm<0b1000, OOL, IOL, "selb\t$rT, $rA, $rB, $rC",
IntegerOp, pattern>;

class SELBVecInst<ValueType vectype>:
class SELBVecInst<ValueType vectype, PatFrag vnot_frag = vnot>:
SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
[(set (vectype VECREG:$rT),
(or (and (vectype VECREG:$rC), (vectype VECREG:$rB)),
(and (vnot (vectype VECREG:$rC)),
(and (vnot_frag (vectype VECREG:$rC)),
(vectype VECREG:$rA))))]>;

class SELBVecVCondInst<ValueType vectype>:
@@ -1947,7 +1950,7 @@ multiclass SelectBits
def v16i8: SELBVecInst<v16i8>;
def v8i16: SELBVecInst<v8i16>;
def v4i32: SELBVecInst<v4i32>;
def v2i64: SELBVecInst<v2i64>;
def v2i64: SELBVecInst<v2i64, vnot_conv>;

def r128: SELBRegInst<GPRC>;
def r64: SELBRegInst<R64C>;
@@ -4321,6 +4324,13 @@ def : Pat<(fabs (v4f32 VECREG:$rA)),
(ANDfabsvec (v4f32 VECREG:$rA),
(v4f32 (ANDBIv16i8 (FSMBIv16i8 0xffff), 0x7f)))>;

def : Pat<(fabs R64FP:$rA),
(ANDfabs64 R64FP:$rA, (ANDBIv16i8 (FSMBIv16i8 0xffff), 0x7f))>;

def : Pat<(fabs (v2f64 VECREG:$rA)),
(ANDfabsvec (v2f64 VECREG:$rA),
(v2f64 (ANDBIv16i8 (FSMBIv16i8 0xffff), 0x7f)))>;

//===----------------------------------------------------------------------===//
// Hint for branch instructions:
//===----------------------------------------------------------------------===//
@@ -1,22 +1,23 @@
; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
; RUN: grep fceq %t1.s | count 1
; RUN: grep fcmeq %t1.s | count 1
;
; This file includes standard floating point arithmetic instructions

target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
target triple = "spu"

; Exercise the floating point comparison operators for f32:

declare double @fabs(double)
declare float @fabsf(float)

define i1 @fcmp_eq(float %arg1, float %arg2) {
%A = fcmp oeq float %arg1, %arg2 ; <float> [#uses=1]
%A = fcmp oeq float %arg1, %arg2
ret i1 %A
}

define i1 @fcmp_mag_eq(float %arg1, float %arg2) {
%A = call float @fabsf(float %arg1) ; <float> [#uses=1]
%B = call float @fabsf(float %arg2) ; <float> [#uses=1]
%C = fcmp oeq float %A, %B ; <float> [#uses=1]
ret i1 %C
%1 = call float @fabsf(float %arg1)
%2 = call float @fabsf(float %arg2)
%3 = fcmp oeq float %1, %2
ret i1 %3
}
test/CodeGen/CellSPU/fcmp64.ll (new file)
@@ -0,0 +1,7 @@
; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s

define i1 @fcmp_eq_setcc_f64(double %arg1, double %arg2) nounwind {
entry:
%A = fcmp oeq double %arg1, %arg2
ret i1 %A
}
@@ -1,9 +1,10 @@
; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
; RUN: grep fsmbi %t1.s | count 2
; RUN: grep fsmbi %t1.s | count 3
; RUN: grep 32768 %t1.s | count 2
; RUN: grep xor %t1.s | count 4
; RUN: grep and %t1.s | count 4
; RUN: grep andbi %t1.s | count 2
; RUN: grep and %t1.s | count 5
; RUN: grep andbi %t1.s | count 3

target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
target triple = "spu"

@@ -33,11 +34,11 @@ declare double @fabs(double)
declare float @fabsf(float)

define double @fabs_dp(double %X) {
%Y = call double @fabs( double %X ) ; <double> [#uses=1]
%Y = call double @fabs( double %X )
ret double %Y
}

define float @fabs_sp(float %X) {
%Y = call float @fabsf( float %X ) ; <float> [#uses=1]
%Y = call float @fabsf( float %X )
ret float %Y
}
@@ -1,5 +1,5 @@
; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
; RUN: grep selb %t1.s | count 280
; RUN: grep selb %t1.s | count 56

target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
target triple = "spu"
@@ -9,7 +9,7 @@ target triple = "spu"
;-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~

; (or (and rC, rB), (and (not rC), rA))
define <2 x i64> @selb_v2i64_01(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
define <2 x i64> @selectbits_v2i64_01(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
%C = and <2 x i64> %rC, %rB
%A = xor <2 x i64> %rC, < i64 -1, i64 -1 >
%B = and <2 x i64> %A, %rA
@@ -18,7 +18,7 @@ define <2 x i64> @selb_v2i64_01(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
}

; (or (and rB, rC), (and (not rC), rA))
define <2 x i64> @selb_v2i64_02(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
define <2 x i64> @selectbits_v2i64_02(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
%C = and <2 x i64> %rB, %rC
%A = xor <2 x i64> %rC, < i64 -1, i64 -1 >
%B = and <2 x i64> %A, %rA
@@ -27,7 +27,7 @@ define <2 x i64> @selb_v2i64_02(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
}

; (or (and (not rC), rA), (and rB, rC))
define <2 x i64> @selb_v2i64_03(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
define <2 x i64> @selectbits_v2i64_03(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
%A = xor <2 x i64> %rC, < i64 -1, i64 -1 >
%B = and <2 x i64> %A, %rA
%C = and <2 x i64> %rB, %rC
@@ -36,7 +36,7 @@ define <2 x i64> @selb_v2i64_03(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
}

; (or (and (not rC), rA), (and rC, rB))
define <2 x i64> @selb_v2i64_04(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
define <2 x i64> @selectbits_v2i64_04(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
%A = xor <2 x i64> %rC, < i64 -1, i64 -1 >
%B = and <2 x i64> %A, %rA
%C = and <2 x i64> %rC, %rB
@@ -45,7 +45,7 @@ define <2 x i64> @selb_v2i64_04(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
}

; (or (and rC, rB), (and rA, (not rC)))
define <2 x i64> @selb_v2i64_05(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
define <2 x i64> @selectbits_v2i64_05(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
%C = and <2 x i64> %rC, %rB
%A = xor <2 x i64> %rC, < i64 -1, i64 -1 >
%B = and <2 x i64> %rA, %A
@@ -54,7 +54,7 @@ define <2 x i64> @selb_v2i64_05(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
}

; (or (and rB, rC), (and rA, (not rC)))
define <2 x i64> @selb_v2i64_06(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
define <2 x i64> @selectbits_v2i64_06(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
%C = and <2 x i64> %rB, %rC
%A = xor <2 x i64> %rC, < i64 -1, i64 -1 >
%B = and <2 x i64> %rA, %A
@@ -63,7 +63,7 @@ define <2 x i64> @selb_v2i64_06(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
}

; (or (and rA, (not rC)), (and rB, rC))
define <2 x i64> @selb_v2i64_07(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
define <2 x i64> @selectbits_v2i64_07(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
%A = xor <2 x i64> %rC, < i64 -1, i64 -1 >
%B = and <2 x i64> %rA, %A
%C = and <2 x i64> %rB, %rC
@@ -72,7 +72,7 @@ define <2 x i64> @selb_v2i64_07(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
}

; (or (and rA, (not rC)), (and rC, rB))
define <2 x i64> @selb_v2i64_08(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
define <2 x i64> @selectbits_v2i64_08(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
%A = xor <2 x i64> %rC, < i64 -1, i64 -1 >
%B = and <2 x i64> %rA, %A
%C = and <2 x i64> %rC, %rB
@@ -85,7 +85,7 @@ define <2 x i64> @selb_v2i64_08(<2 x i64> %rA, <2 x i64> %rB, <2 x i64> %rC) {
;-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
; (or (and rC, rB), (and (not rC), rA))
define <4 x i32> @selb_v4i32_01(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
define <4 x i32> @selectbits_v4i32_01(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
%C = and <4 x i32> %rC, %rB
%A = xor <4 x i32> %rC, < i32 -1, i32 -1, i32 -1, i32 -1 >
%B = and <4 x i32> %A, %rA
@@ -94,7 +94,7 @@ define <4 x i32> @selb_v4i32_01(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
}

; (or (and rB, rC), (and (not rC), rA))
define <4 x i32> @selb_v4i32_02(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
define <4 x i32> @selectbits_v4i32_02(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
%C = and <4 x i32> %rB, %rC
%A = xor <4 x i32> %rC, < i32 -1, i32 -1, i32 -1, i32 -1 >
%B = and <4 x i32> %A, %rA
@@ -103,7 +103,7 @@ define <4 x i32> @selb_v4i32_02(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
}

; (or (and (not rC), rA), (and rB, rC))
define <4 x i32> @selb_v4i32_03(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
define <4 x i32> @selectbits_v4i32_03(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
%A = xor <4 x i32> %rC, < i32 -1, i32 -1, i32 -1, i32 -1 >
%B = and <4 x i32> %A, %rA
%C = and <4 x i32> %rB, %rC
@@ -112,7 +112,7 @@ define <4 x i32> @selb_v4i32_03(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
}

; (or (and (not rC), rA), (and rC, rB))
define <4 x i32> @selb_v4i32_04(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
define <4 x i32> @selectbits_v4i32_04(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
%A = xor <4 x i32> %rC, < i32 -1, i32 -1, i32 -1, i32 -1>
%B = and <4 x i32> %A, %rA
%C = and <4 x i32> %rC, %rB
@@ -121,7 +121,7 @@ define <4 x i32> @selb_v4i32_04(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
}

; (or (and rC, rB), (and rA, (not rC)))
define <4 x i32> @selb_v4i32_05(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
define <4 x i32> @selectbits_v4i32_05(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
%C = and <4 x i32> %rC, %rB
%A = xor <4 x i32> %rC, < i32 -1, i32 -1, i32 -1, i32 -1>
%B = and <4 x i32> %rA, %A
@@ -130,7 +130,7 @@ define <4 x i32> @selb_v4i32_05(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
}

; (or (and rB, rC), (and rA, (not rC)))
define <4 x i32> @selb_v4i32_06(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
define <4 x i32> @selectbits_v4i32_06(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
%C = and <4 x i32> %rB, %rC
%A = xor <4 x i32> %rC, < i32 -1, i32 -1, i32 -1, i32 -1>
%B = and <4 x i32> %rA, %A
@@ -139,7 +139,7 @@ define <4 x i32> @selb_v4i32_06(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
}

; (or (and rA, (not rC)), (and rB, rC))
define <4 x i32> @selb_v4i32_07(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
define <4 x i32> @selectbits_v4i32_07(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
%A = xor <4 x i32> %rC, < i32 -1, i32 -1, i32 -1, i32 -1>
%B = and <4 x i32> %rA, %A
%C = and <4 x i32> %rB, %rC
@@ -148,7 +148,7 @@ define <4 x i32> @selb_v4i32_07(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
}

; (or (and rA, (not rC)), (and rC, rB))
define <4 x i32> @selb_v4i32_08(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
define <4 x i32> @selectbits_v4i32_08(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
%A = xor <4 x i32> %rC, < i32 -1, i32 -1, i32 -1, i32 -1>
%B = and <4 x i32> %rA, %A
%C = and <4 x i32> %rC, %rB
@@ -161,7 +161,7 @@ define <4 x i32> @selb_v4i32_08(<4 x i32> %rA, <4 x i32> %rB, <4 x i32> %rC) {
;-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
; (or (and rC, rB), (and (not rC), rA))
define <8 x i16> @selb_v8i16_01(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
define <8 x i16> @selectbits_v8i16_01(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
%C = and <8 x i16> %rC, %rB
%A = xor <8 x i16> %rC, < i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1 >
@@ -171,7 +171,7 @@ define <8 x i16> @selb_v8i16_01(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
}

; (or (and rB, rC), (and (not rC), rA))
define <8 x i16> @selb_v8i16_02(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
define <8 x i16> @selectbits_v8i16_02(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
%C = and <8 x i16> %rB, %rC
%A = xor <8 x i16> %rC, < i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1 >
@@ -181,7 +181,7 @@ define <8 x i16> @selb_v8i16_02(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
}

; (or (and (not rC), rA), (and rB, rC))
define <8 x i16> @selb_v8i16_03(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
define <8 x i16> @selectbits_v8i16_03(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
%A = xor <8 x i16> %rC, < i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1 >
%B = and <8 x i16> %A, %rA
@@ -191,7 +191,7 @@ define <8 x i16> @selb_v8i16_03(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
}

; (or (and (not rC), rA), (and rC, rB))
define <8 x i16> @selb_v8i16_04(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
define <8 x i16> @selectbits_v8i16_04(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
%A = xor <8 x i16> %rC, < i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1 >
%B = and <8 x i16> %A, %rA
@@ -201,7 +201,7 @@ define <8 x i16> @selb_v8i16_04(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
}

; (or (and rC, rB), (and rA, (not rC)))
define <8 x i16> @selb_v8i16_05(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
define <8 x i16> @selectbits_v8i16_05(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
%C = and <8 x i16> %rC, %rB
%A = xor <8 x i16> %rC, < i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1 >
@@ -211,7 +211,7 @@ define <8 x i16> @selb_v8i16_05(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
}

; (or (and rB, rC), (and rA, (not rC)))
define <8 x i16> @selb_v8i16_06(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
define <8 x i16> @selectbits_v8i16_06(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
%C = and <8 x i16> %rB, %rC
%A = xor <8 x i16> %rC, < i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1 >
@@ -221,7 +221,7 @@ define <8 x i16> @selb_v8i16_06(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
}

; (or (and rA, (not rC)), (and rB, rC))
define <8 x i16> @selb_v8i16_07(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
define <8 x i16> @selectbits_v8i16_07(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
%A = xor <8 x i16> %rC, < i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1 >
%B = and <8 x i16> %rA, %A
@@ -231,7 +231,7 @@ define <8 x i16> @selb_v8i16_07(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
}

; (or (and rA, (not rC)), (and rC, rB))
define <8 x i16> @selb_v8i16_08(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
define <8 x i16> @selectbits_v8i16_08(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
%A = xor <8 x i16> %rC, < i16 -1, i16 -1, i16 -1, i16 -1,
i16 -1, i16 -1, i16 -1, i16 -1 >
%B = and <8 x i16> %rA, %A
@@ -245,7 +245,7 @@ define <8 x i16> @selb_v8i16_08(<8 x i16> %rA, <8 x i16> %rB, <8 x i16> %rC) {
;-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
; (or (and rC, rB), (and (not rC), rA))
define <16 x i8> @selb_v16i8_01(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
define <16 x i8> @selectbits_v16i8_01(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
%C = and <16 x i8> %rC, %rB
%A = xor <16 x i8> %rC, < i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -257,7 +257,7 @@ define <16 x i8> @selb_v16i8_01(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
}

; (or (and rB, rC), (and (not rC), rA))
define <16 x i8> @selb_v16i8_02(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
define <16 x i8> @selectbits_v16i8_02(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
%C = and <16 x i8> %rB, %rC
%A = xor <16 x i8> %rC, < i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -269,7 +269,7 @@ define <16 x i8> @selb_v16i8_02(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
}

; (or (and (not rC), rA), (and rB, rC))
define <16 x i8> @selb_v16i8_03(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
define <16 x i8> @selectbits_v16i8_03(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
%A = xor <16 x i8> %rC, < i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -281,7 +281,7 @@ define <16 x i8> @selb_v16i8_03(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
}

; (or (and (not rC), rA), (and rC, rB))
define <16 x i8> @selb_v16i8_04(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
define <16 x i8> @selectbits_v16i8_04(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
%A = xor <16 x i8> %rC, < i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -293,7 +293,7 @@ define <16 x i8> @selb_v16i8_04(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
}

; (or (and rC, rB), (and rA, (not rC)))
define <16 x i8> @selb_v16i8_05(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
define <16 x i8> @selectbits_v16i8_05(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
%C = and <16 x i8> %rC, %rB
%A = xor <16 x i8> %rC, < i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -305,7 +305,7 @@ define <16 x i8> @selb_v16i8_05(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
}

; (or (and rB, rC), (and rA, (not rC)))
define <16 x i8> @selb_v16i8_06(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
define <16 x i8> @selectbits_v16i8_06(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
%C = and <16 x i8> %rB, %rC
%A = xor <16 x i8> %rC, < i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -317,7 +317,7 @@ define <16 x i8> @selb_v16i8_06(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
}

; (or (and rA, (not rC)), (and rB, rC))
define <16 x i8> @selb_v16i8_07(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
define <16 x i8> @selectbits_v16i8_07(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
%A = xor <16 x i8> %rC, < i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -329,7 +329,7 @@ define <16 x i8> @selb_v16i8_07(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
}

; (or (and rA, (not rC)), (and rC, rB))
define <16 x i8> @selb_v16i8_08(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
define <16 x i8> @selectbits_v16i8_08(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
%A = xor <16 x i8> %rC, < i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
i8 -1, i8 -1, i8 -1, i8 -1,
@@ -345,7 +345,7 @@ define <16 x i8> @selb_v16i8_08(<16 x i8> %rA, <16 x i8> %rB, <16 x i8> %rC) {
;-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
; (or (and rC, rB), (and (not rC), rA))
define i32 @selb_i32_01(i32 %rA, i32 %rB, i32 %rC) {
define i32 @selectbits_i32_01(i32 %rA, i32 %rB, i32 %rC) {
        %C = and i32 %rC, %rB
        %A = xor i32 %rC, -1
        %B = and i32 %A, %rA
@ -354,7 +354,7 @@ define i32 @selb_i32_01(i32 %rA, i32 %rB, i32 %rC) {
}

; (or (and rB, rC), (and (not rC), rA))
define i32 @selb_i32_02(i32 %rA, i32 %rB, i32 %rC) {
define i32 @selectbits_i32_02(i32 %rA, i32 %rB, i32 %rC) {
        %C = and i32 %rB, %rC
        %A = xor i32 %rC, -1
        %B = and i32 %A, %rA
@ -363,7 +363,7 @@ define i32 @selb_i32_02(i32 %rA, i32 %rB, i32 %rC) {
}

; (or (and (not rC), rA), (and rB, rC))
define i32 @selb_i32_03(i32 %rA, i32 %rB, i32 %rC) {
define i32 @selectbits_i32_03(i32 %rA, i32 %rB, i32 %rC) {
        %A = xor i32 %rC, -1
        %B = and i32 %A, %rA
        %C = and i32 %rB, %rC
@ -372,7 +372,7 @@ define i32 @selb_i32_03(i32 %rA, i32 %rB, i32 %rC) {
}

; (or (and (not rC), rA), (and rC, rB))
define i32 @selb_i32_04(i32 %rA, i32 %rB, i32 %rC) {
define i32 @selectbits_i32_04(i32 %rA, i32 %rB, i32 %rC) {
        %A = xor i32 %rC, -1
        %B = and i32 %A, %rA
        %C = and i32 %rC, %rB
@ -381,7 +381,7 @@ define i32 @selb_i32_04(i32 %rA, i32 %rB, i32 %rC) {
}

; (or (and rC, rB), (and rA, (not rC)))
define i32 @selb_i32_05(i32 %rA, i32 %rB, i32 %rC) {
define i32 @selectbits_i32_05(i32 %rA, i32 %rB, i32 %rC) {
        %C = and i32 %rC, %rB
        %A = xor i32 %rC, -1
        %B = and i32 %rA, %A
@ -390,7 +390,7 @@ define i32 @selb_i32_05(i32 %rA, i32 %rB, i32 %rC) {
}

; (or (and rB, rC), (and rA, (not rC)))
define i32 @selb_i32_06(i32 %rA, i32 %rB, i32 %rC) {
define i32 @selectbits_i32_06(i32 %rA, i32 %rB, i32 %rC) {
        %C = and i32 %rB, %rC
        %A = xor i32 %rC, -1
        %B = and i32 %rA, %A
@ -399,7 +399,7 @@ define i32 @selb_i32_06(i32 %rA, i32 %rB, i32 %rC) {
}

; (or (and rA, (not rC)), (and rB, rC))
define i32 @selb_i32_07(i32 %rA, i32 %rB, i32 %rC) {
define i32 @selectbits_i32_07(i32 %rA, i32 %rB, i32 %rC) {
        %A = xor i32 %rC, -1
        %B = and i32 %rA, %A
        %C = and i32 %rB, %rC
@ -408,7 +408,7 @@ define i32 @selb_i32_07(i32 %rA, i32 %rB, i32 %rC) {
}

; (or (and rA, (not rC)), (and rC, rB))
define i32 @selb_i32_08(i32 %rA, i32 %rB, i32 %rC) {
define i32 @selectbits_i32_08(i32 %rA, i32 %rB, i32 %rC) {
        %A = xor i32 %rC, -1
        %B = and i32 %rA, %A
        %C = and i32 %rC, %rB
@ -421,7 +421,7 @@ define i32 @selb_i32_08(i32 %rA, i32 %rB, i32 %rC) {
;-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~

; (or (and rC, rB), (and (not rC), rA))
define i16 @selb_i16_01(i16 %rA, i16 %rB, i16 %rC) {
define i16 @selectbits_i16_01(i16 %rA, i16 %rB, i16 %rC) {
        %C = and i16 %rC, %rB
        %A = xor i16 %rC, -1
        %B = and i16 %A, %rA
@ -430,7 +430,7 @@ define i16 @selb_i16_01(i16 %rA, i16 %rB, i16 %rC) {
}

; (or (and rB, rC), (and (not rC), rA))
define i16 @selb_i16_02(i16 %rA, i16 %rB, i16 %rC) {
define i16 @selectbits_i16_02(i16 %rA, i16 %rB, i16 %rC) {
        %C = and i16 %rB, %rC
        %A = xor i16 %rC, -1
        %B = and i16 %A, %rA
@ -439,7 +439,7 @@ define i16 @selb_i16_02(i16 %rA, i16 %rB, i16 %rC) {
}

; (or (and (not rC), rA), (and rB, rC))
define i16 @selb_i16_03(i16 %rA, i16 %rB, i16 %rC) {
define i16 @selectbits_i16_03(i16 %rA, i16 %rB, i16 %rC) {
        %A = xor i16 %rC, -1
        %B = and i16 %A, %rA
        %C = and i16 %rB, %rC
@ -448,7 +448,7 @@ define i16 @selb_i16_03(i16 %rA, i16 %rB, i16 %rC) {
}

; (or (and (not rC), rA), (and rC, rB))
define i16 @selb_i16_04(i16 %rA, i16 %rB, i16 %rC) {
define i16 @selectbits_i16_04(i16 %rA, i16 %rB, i16 %rC) {
        %A = xor i16 %rC, -1
        %B = and i16 %A, %rA
        %C = and i16 %rC, %rB
@ -457,7 +457,7 @@ define i16 @selb_i16_04(i16 %rA, i16 %rB, i16 %rC) {
}

; (or (and rC, rB), (and rA, (not rC)))
define i16 @selb_i16_05(i16 %rA, i16 %rB, i16 %rC) {
define i16 @selectbits_i16_05(i16 %rA, i16 %rB, i16 %rC) {
        %C = and i16 %rC, %rB
        %A = xor i16 %rC, -1
        %B = and i16 %rA, %A
@ -466,7 +466,7 @@ define i16 @selb_i16_05(i16 %rA, i16 %rB, i16 %rC) {
}

; (or (and rB, rC), (and rA, (not rC)))
define i16 @selb_i16_06(i16 %rA, i16 %rB, i16 %rC) {
define i16 @selectbits_i16_06(i16 %rA, i16 %rB, i16 %rC) {
        %C = and i16 %rB, %rC
        %A = xor i16 %rC, -1
        %B = and i16 %rA, %A
@ -475,7 +475,7 @@ define i16 @selb_i16_06(i16 %rA, i16 %rB, i16 %rC) {
}

; (or (and rA, (not rC)), (and rB, rC))
define i16 @selb_i16_07(i16 %rA, i16 %rB, i16 %rC) {
define i16 @selectbits_i16_07(i16 %rA, i16 %rB, i16 %rC) {
        %A = xor i16 %rC, -1
        %B = and i16 %rA, %A
        %C = and i16 %rB, %rC
@ -484,7 +484,7 @@ define i16 @selb_i16_07(i16 %rA, i16 %rB, i16 %rC) {
}

; (or (and rA, (not rC)), (and rC, rB))
define i16 @selb_i16_08(i16 %rA, i16 %rB, i16 %rC) {
define i16 @selectbits_i16_08(i16 %rA, i16 %rB, i16 %rC) {
        %A = xor i16 %rC, -1
        %B = and i16 %rA, %A
        %C = and i16 %rC, %rB
@ -497,7 +497,7 @@ define i16 @selb_i16_08(i16 %rA, i16 %rB, i16 %rC) {
;-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~

; (or (and rC, rB), (and (not rC), rA))
define i8 @selb_i8_01(i8 %rA, i8 %rB, i8 %rC) {
define i8 @selectbits_i8_01(i8 %rA, i8 %rB, i8 %rC) {
        %C = and i8 %rC, %rB
        %A = xor i8 %rC, -1
        %B = and i8 %A, %rA
@ -506,7 +506,7 @@ define i8 @selb_i8_01(i8 %rA, i8 %rB, i8 %rC) {
}

; (or (and rB, rC), (and (not rC), rA))
define i8 @selb_i8_02(i8 %rA, i8 %rB, i8 %rC) {
define i8 @selectbits_i8_02(i8 %rA, i8 %rB, i8 %rC) {
        %C = and i8 %rB, %rC
        %A = xor i8 %rC, -1
        %B = and i8 %A, %rA
@ -515,7 +515,7 @@ define i8 @selb_i8_02(i8 %rA, i8 %rB, i8 %rC) {
}

; (or (and (not rC), rA), (and rB, rC))
define i8 @selb_i8_03(i8 %rA, i8 %rB, i8 %rC) {
define i8 @selectbits_i8_03(i8 %rA, i8 %rB, i8 %rC) {
        %A = xor i8 %rC, -1
        %B = and i8 %A, %rA
        %C = and i8 %rB, %rC
@ -524,7 +524,7 @@ define i8 @selb_i8_03(i8 %rA, i8 %rB, i8 %rC) {
}

; (or (and (not rC), rA), (and rC, rB))
define i8 @selb_i8_04(i8 %rA, i8 %rB, i8 %rC) {
define i8 @selectbits_i8_04(i8 %rA, i8 %rB, i8 %rC) {
        %A = xor i8 %rC, -1
        %B = and i8 %A, %rA
        %C = and i8 %rC, %rB
@ -533,7 +533,7 @@ define i8 @selb_i8_04(i8 %rA, i8 %rB, i8 %rC) {
}

; (or (and rC, rB), (and rA, (not rC)))
define i8 @selb_i8_05(i8 %rA, i8 %rB, i8 %rC) {
define i8 @selectbits_i8_05(i8 %rA, i8 %rB, i8 %rC) {
        %C = and i8 %rC, %rB
        %A = xor i8 %rC, -1
        %B = and i8 %rA, %A
@ -542,7 +542,7 @@ define i8 @selb_i8_05(i8 %rA, i8 %rB, i8 %rC) {
}

; (or (and rB, rC), (and rA, (not rC)))
define i8 @selb_i8_06(i8 %rA, i8 %rB, i8 %rC) {
define i8 @selectbits_i8_06(i8 %rA, i8 %rB, i8 %rC) {
        %C = and i8 %rB, %rC
        %A = xor i8 %rC, -1
        %B = and i8 %rA, %A
@ -551,7 +551,7 @@ define i8 @selb_i8_06(i8 %rA, i8 %rB, i8 %rC) {
}

; (or (and rA, (not rC)), (and rB, rC))
define i8 @selb_i8_07(i8 %rA, i8 %rB, i8 %rC) {
define i8 @selectbits_i8_07(i8 %rA, i8 %rB, i8 %rC) {
        %A = xor i8 %rC, -1
        %B = and i8 %rA, %A
        %C = and i8 %rB, %rC
@ -560,7 +560,7 @@ define i8 @selb_i8_07(i8 %rA, i8 %rB, i8 %rC) {
}

; (or (and rA, (not rC)), (and rC, rB))
define i8 @selb_i8_08(i8 %rA, i8 %rB, i8 %rC) {
define i8 @selectbits_i8_08(i8 %rA, i8 %rB, i8 %rC) {
        %A = xor i8 %rC, -1
        %B = and i8 %rA, %A
        %C = and i8 %rC, %rB

@ -275,3 +275,9 @@ define i64 @ashr_i64_3(i64 %arg1, i32 %shift) {
        %2 = ashr i64 %arg1, %1
        ret i64 %2
}

define i32 @hi32_i64(i64 %arg) {
        %1 = lshr i64 %arg, 32
        %2 = trunc i64 %1 to i32
        ret i32 %2
}